Splits a list of text documents into a list of text documents with shorter texts.
Splitting documents with long texts is a common preprocessing step during indexing.
This allows Embedders to create meaningful semantic representations
and avoids exceeding the maximum context length of language models.
Source code in dynamiq/components/splitters/document.py
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
class DocumentSplitter:
    """
    Splits a list of text documents into a list of text documents with shorter texts.

    Splitting documents with long texts is a common preprocessing step during indexing.
    This allows Embedders to create meaningful semantic representations
    and avoids exceeding the maximum context length of language models.
    """

    def __init__(
        self,
        split_by: DocumentSplitBy = DocumentSplitBy.PASSAGE,
        split_length: int = 10,
        split_overlap: int = 0,
    ):
        """
        Initializes an object for splitting documents into smaller parts based on specified criteria.

        Args:
            split_by (DocumentSplitBy): Determines the unit by which the document should be split.
                Defaults to DocumentSplitBy.PASSAGE.
            split_length (int): Specifies the maximum number of units to include in each split.
                Defaults to 10.
            split_overlap (int): Specifies the number of units that should overlap between consecutive
                splits. Must be smaller than split_length. Defaults to 0.

        Raises:
            ValueError: If split_length is less than or equal to 0.
            ValueError: If split_overlap is less than 0.
            ValueError: If split_overlap is greater than or equal to split_length.
        """
        if split_length <= 0:
            raise ValueError("split_length must be greater than 0.")
        if split_overlap < 0:
            raise ValueError("split_overlap must be greater than or equal to 0.")
        # _concatenate_units advances its window by (split_length - split_overlap);
        # an overlap that is not strictly smaller would make that step zero or
        # negative, so reject the configuration here instead of failing in run().
        if split_overlap >= split_length:
            raise ValueError("split_overlap must be less than split_length.")

        self.split_by = split_by
        self.split_length = split_length
        self.split_overlap = split_overlap

    def run(self, documents: list[Document]) -> dict:
        """
        Splits the provided documents into smaller parts based on the specified configuration.

        The input documents are not modified. Every produced Document receives its own
        copy of the source document's metadata plus a "source_id" entry holding the
        source document's id.

        Args:
            documents (list[Document]): The list of documents to be split.

        Returns:
            dict: A dictionary containing one key, 'documents', which is a list of the split Documents.

        Raises:
            TypeError: If the input is not a list of Document instances.
            ValueError: If the content of any document is None.
        """
        # Validate every element, not only the first, so a stray non-Document
        # further down the list is reported as the documented TypeError.
        if not isinstance(documents, list) or not all(
            isinstance(doc, Document) for doc in documents
        ):
            raise TypeError("DocumentSplitter expects a List of Documents as input.")

        split_docs = []
        for doc in documents:
            if doc.content is None:
                raise ValueError(
                    f"DocumentSplitter only works with text documents but document.content for document "
                    f"ID {doc.id} is None."
                )
            units = self._split_into_units(doc.content, self.split_by)
            text_splits = self._concatenate_units(
                units, self.split_length, self.split_overlap
            )
            # Treat missing metadata as empty without writing back to the input document.
            base_metadata = {} if doc.metadata is None else doc.metadata
            for txt in text_splits:
                # Copy per split so that no two output documents are handed the same dict object.
                metadata = deepcopy(base_metadata)
                metadata["source_id"] = doc.id
                split_docs.append(Document(content=txt, metadata=metadata))
        return {"documents": split_docs}

    def _split_into_units(self, text: str, split_by: DocumentSplitBy) -> list[str]:
        """
        Splits the input text into units based on the specified split_by method.

        The delimiter is kept in the units, so concatenating the returned units
        reproduces the input text.

        Args:
            text (str): The input text to be split.
            split_by (DocumentSplitBy): The method to use for splitting the text.

        Returns:
            list[str]: A list of text units after splitting.
        """
        split_at = SPLIT_STR_BY_SPLIT_TYPE[split_by]
        if split_by == DocumentSplitBy.CHARACTER:
            return list(text)

        units = text.split(split_at)
        if split_at == "\n#":
            # The "\n#" delimiter opens the unit that follows it, so it goes back
            # in front of every unit after the first. The text before the first
            # delimiter never had one and is left as is.
            for i in range(1, len(units)):
                units[i] = split_at + units[i]
        else:
            # Other delimiters terminate a unit: append to all units except the last one.
            for i in range(len(units) - 1):
                units[i] += split_at
        return units

    def _concatenate_units(
        self, elements: list[str], split_length: int, split_overlap: int
    ) -> list[str]:
        """
        Concatenates the elements into parts of split_length units.

        Args:
            elements (list[str]): The list of text units to be concatenated.
            split_length (int): The maximum number of units in each split.
            split_overlap (int): The number of overlapping units between splits.
                Expected to be smaller than split_length so the window step is positive.

        Returns:
            list[str]: A list of concatenated text splits; empty splits are dropped.
        """
        text_splits = []
        windows = windowed(elements, n=split_length, step=split_length - split_overlap)
        for window in windows:
            # A window shorter than split_length carries None placeholders; skip them.
            txt = "".join(unit for unit in window if unit is not None)
            if txt:
                text_splits.append(txt)
        return text_splits
|
__init__(split_by=DocumentSplitBy.PASSAGE, split_length=10, split_overlap=0)
Initializes an object for splitting documents into smaller parts based on specified criteria.
Parameters:
Name |
Type |
Description |
Default |
split_by |
DocumentSplitBy
|
Determines the unit by which the document should be split.
Defaults to DocumentSplitBy.PASSAGE.
|
PASSAGE
|
split_length |
int
|
Specifies the maximum number of units to include in each split.
Defaults to 10.
|
10
|
split_overlap |
int
|
Specifies the number of units that should overlap between consecutive
splits. Defaults to 0.
|
0
|
Raises:
Type |
Description |
ValueError
|
If split_length is less than or equal to 0.
|
ValueError
|
If split_overlap is less than 0.
|
Source code in dynamiq/components/splitters/document.py
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
def __init__(
    self,
    split_by: DocumentSplitBy = DocumentSplitBy.PASSAGE,
    split_length: int = 10,
    split_overlap: int = 0,
):
    """
    Initializes an object for splitting documents into smaller parts based on specified criteria.

    Args:
        split_by (DocumentSplitBy): Determines the unit by which the document should be split.
            Defaults to DocumentSplitBy.PASSAGE.
        split_length (int): Specifies the maximum number of units to include in each split.
            Defaults to 10.
        split_overlap (int): Specifies the number of units that should overlap between consecutive
            splits. Must be smaller than split_length. Defaults to 0.

    Raises:
        ValueError: If split_length is less than or equal to 0.
        ValueError: If split_overlap is less than 0.
        ValueError: If split_overlap is greater than or equal to split_length.
    """
    if split_length <= 0:
        raise ValueError("split_length must be greater than 0.")
    if split_overlap < 0:
        raise ValueError("split_overlap must be greater than or equal to 0.")
    # The splitting window advances by (split_length - split_overlap) units;
    # an overlap that is not strictly smaller would make that step zero or
    # negative, so reject the configuration here instead of failing in run().
    if split_overlap >= split_length:
        raise ValueError("split_overlap must be less than split_length.")

    self.split_by = split_by
    self.split_length = split_length
    self.split_overlap = split_overlap
|
run(documents)
Splits the provided documents into smaller parts based on the specified configuration.
Parameters:
Name |
Type |
Description |
Default |
documents |
list[Document]
|
The list of documents to be split.
|
required
|
Returns:
Name | Type |
Description |
dict |
dict
|
A dictionary containing one key, 'documents', which is a list of the split Documents.
|
Raises:
Type |
Description |
TypeError
|
If the input is not a list of Document instances.
|
ValueError
|
If the content of any document is None.
|
Source code in dynamiq/components/splitters/document.py
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
def run(self, documents: list[Document]) -> dict:
    """
    Splits the provided documents into smaller parts based on the specified configuration.

    The input documents are not modified. Every produced Document receives its own
    copy of the source document's metadata plus a "source_id" entry holding the
    source document's id.

    Args:
        documents (list[Document]): The list of documents to be split.

    Returns:
        dict: A dictionary containing one key, 'documents', which is a list of the split Documents.

    Raises:
        TypeError: If the input is not a list of Document instances.
        ValueError: If the content of any document is None.
    """
    # Validate every element, not only the first, so a stray non-Document
    # further down the list is reported as the documented TypeError.
    if not isinstance(documents, list) or not all(
        isinstance(doc, Document) for doc in documents
    ):
        raise TypeError("DocumentSplitter expects a List of Documents as input.")

    split_docs = []
    for doc in documents:
        if doc.content is None:
            raise ValueError(
                f"DocumentSplitter only works with text documents but document.content for document "
                f"ID {doc.id} is None."
            )
        units = self._split_into_units(doc.content, self.split_by)
        text_splits = self._concatenate_units(
            units, self.split_length, self.split_overlap
        )
        # Treat missing metadata as empty without writing back to the input document.
        base_metadata = {} if doc.metadata is None else doc.metadata
        for txt in text_splits:
            # Copy per split so that no two output documents are handed the same dict object.
            metadata = deepcopy(base_metadata)
            metadata["source_id"] = doc.id
            split_docs.append(Document(content=txt, metadata=metadata))
    return {"documents": split_docs}
|