Skip to content

Document

DocumentSplitBy

Bases: str, Enum

Enum class for document splitting methods.

Source code in dynamiq/components/splitters/document.py
 7
 8
 9
10
11
12
13
14
15
class DocumentSplitBy(str, enum.Enum):
    """Units by which a document's text can be split.

    Every member also subclasses ``str``, so it compares equal to its plain
    string value (for example ``DocumentSplitBy.WORD == "word"``) and can be
    looked up from that value (``DocumentSplitBy("word")``).
    """

    # Iterating over the enum yields members in this declaration order.
    WORD = "word"
    SENTENCE = "sentence"
    PAGE = "page"
    PASSAGE = "passage"
    TITLE = "title"
    CHARACTER = "character"

DocumentSplitter

Splits a list of text documents into a list of text documents with shorter texts.

Splitting documents with long texts is a common preprocessing step during indexing. It lets Embedders create meaningful semantic representations and avoids exceeding the maximum context length of language models.

Source code in dynamiq/components/splitters/document.py
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
class DocumentSplitter:
    """
    Splits a list of text documents into a list of text documents with shorter texts.

    Splitting documents with long texts is a common preprocessing step during indexing.
    It lets Embedders create meaningful semantic representations
    and avoids exceeding the maximum context length of language models.
    """

    def __init__(
        self,
        split_by: DocumentSplitBy = DocumentSplitBy.PASSAGE,
        split_length: int = 10,
        split_overlap: int = 0,
    ):
        """
        Initializes an object for splitting documents into smaller parts based on specified criteria.

        Args:
            split_by (DocumentSplitBy): Determines the unit by which the document should be split.
                Defaults to DocumentSplitBy.PASSAGE.
            split_length (int): Specifies the maximum number of units to include in each split.
                Defaults to 10.
            split_overlap (int): Specifies the number of units that should overlap between consecutive
                splits. Must be smaller than split_length. Defaults to 0.

        Raises:
            ValueError: If split_length is less than or equal to 0.
            ValueError: If split_overlap is less than 0.
            ValueError: If split_overlap is greater than or equal to split_length.
        """
        if split_length <= 0:
            raise ValueError("split_length must be greater than 0.")
        if split_overlap < 0:
            raise ValueError("split_overlap must be greater than or equal to 0.")
        # The window step used in _concatenate_units is split_length - split_overlap.
        # Reject configurations that would make it zero or negative here, instead
        # of letting them fail later in the middle of run().
        if split_overlap >= split_length:
            raise ValueError("split_overlap must be less than split_length.")

        self.split_by = split_by
        self.split_length = split_length
        self.split_overlap = split_overlap

    def run(self, documents: list[Document]) -> dict:
        """
        Splits the provided documents into smaller parts based on the specified configuration.

        The input documents are not modified. Every produced Document receives its own
        copy of the source document's metadata, extended with a 'source_id' entry that
        holds the id of the document it was cut from.

        Args:
            documents (list[Document]): The list of documents to be split.

        Returns:
            dict: A dictionary containing one key, 'documents', which is a list of the split Documents.

        Raises:
            TypeError: If the input is not a list, or its first element is not a Document.
            ValueError: If the content of any document is None.
        """
        if not isinstance(documents, list) or (
            documents and not isinstance(documents[0], Document)
        ):
            raise TypeError("DocumentSplitter expects a List of Documents as input.")

        split_docs = []
        for doc in documents:
            if doc.content is None:
                raise ValueError(
                    f"DocumentSplitter only works with text documents but document.content for document "
                    f"ID {doc.id} is None."
                )
            units = self._split_into_units(doc.content, self.split_by)
            text_splits = self._concatenate_units(
                units, self.split_length, self.split_overlap
            )
            # Build the metadata from a copy so the caller's document is left untouched,
            # including when its metadata is None.
            base_metadata = deepcopy(doc.metadata) if doc.metadata is not None else {}
            base_metadata["source_id"] = doc.id
            # Give each split an independent copy; otherwise editing the metadata of
            # one split could silently change the metadata of its siblings.
            split_docs.extend(
                Document(content=txt, metadata=deepcopy(base_metadata))
                for txt in text_splits
            )
        return {"documents": split_docs}

    def _split_into_units(self, text: str, split_by: DocumentSplitBy) -> list[str]:
        """
        Splits the input text into units based on the specified split_by method.

        The delimiter removed by the split is put back, so joining the returned
        units reproduces the original text.

        Args:
            text (str): The input text to be split.
            split_by (DocumentSplitBy): The method to use for splitting the text.

        Returns:
            list[str]: A list of text units after splitting.
        """
        # Character splitting needs no delimiter: every character is its own unit.
        if split_by == DocumentSplitBy.CHARACTER:
            return list(text)

        split_at = SPLIT_STR_BY_SPLIT_TYPE[split_by]
        units = text.split(split_at)
        if split_at == "\n#":
            # A heading marker opens the section that follows it, so it is
            # re-attached to the front of every unit except the first one
            # (the text that precedes the first marker).
            return units[:1] + [split_at + unit for unit in units[1:]]
        # Any other delimiter closes the unit before it, so it is appended
        # to every unit except the last one.
        return [unit + split_at for unit in units[:-1]] + units[-1:]

    def _concatenate_units(
        self, elements: list[str], split_length: int, split_overlap: int
    ) -> list[str]:
        """
        Concatenates the elements into parts of split_length units.

        Args:
            elements (list[str]): The list of text units to be concatenated.
            split_length (int): The maximum number of units in each split.
            split_overlap (int): The number of overlapping units between splits.

        Returns:
            list[str]: A list of concatenated text splits. Empty splits are omitted.
        """
        # Consecutive windows start (split_length - split_overlap) units apart.
        segments = windowed(elements, n=split_length, step=split_length - split_overlap)
        # None entries are dropped before joining; presumably they are the fill
        # values `windowed` uses to pad a short final window (confirm against the
        # `windowed` implementation imported by this module).
        joined = ("".join(unit for unit in seg if unit is not None) for seg in segments)
        return [txt for txt in joined if txt]

__init__(split_by=DocumentSplitBy.PASSAGE, split_length=10, split_overlap=0)

Initializes an object for splitting documents into smaller parts based on specified criteria.

Parameters:

Name Type Description Default
split_by DocumentSplitBy

Determines the unit by which the document should be split. Defaults to DocumentSplitBy.PASSAGE.

PASSAGE
split_length int

Specifies the maximum number of units to include in each split. Defaults to 10.

10
split_overlap int

Specifies the number of units that should overlap between consecutive splits. Defaults to 0.

0

Raises:

Type Description
ValueError

If split_length is less than or equal to 0.

ValueError

If split_overlap is less than 0.

Source code in dynamiq/components/splitters/document.py
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
def __init__(
    self,
    split_by: DocumentSplitBy = DocumentSplitBy.PASSAGE,
    split_length: int = 10,
    split_overlap: int = 0,
):
    """
    Initializes an object for splitting documents into smaller parts based on specified criteria.

    Args:
        split_by (DocumentSplitBy): Determines the unit by which the document should be split.
            Defaults to DocumentSplitBy.PASSAGE.
        split_length (int): Specifies the maximum number of units to include in each split.
            Defaults to 10.
        split_overlap (int): Specifies the number of units that should overlap between consecutive
            splits. Must be smaller than split_length. Defaults to 0.

    Raises:
        ValueError: If split_length is less than or equal to 0.
        ValueError: If split_overlap is less than 0.
        ValueError: If split_overlap is greater than or equal to split_length.
    """
    if split_length <= 0:
        raise ValueError("split_length must be greater than 0.")
    if split_overlap < 0:
        raise ValueError("split_overlap must be greater than or equal to 0.")
    # An overlap that is not smaller than the split length leaves no room to
    # advance from one split to the next, so reject it at construction time.
    if split_overlap >= split_length:
        raise ValueError("split_overlap must be less than split_length.")

    self.split_by = split_by
    self.split_length = split_length
    self.split_overlap = split_overlap

run(documents)

Splits the provided documents into smaller parts based on the specified configuration.

Parameters:

Name Type Description Default
documents list[Document]

The list of documents to be split.

required

Returns:

Name Type Description
dict dict

A dictionary containing one key, 'documents', which is a list of the split Documents.

Raises:

Type Description
TypeError

If the input is not a list of Document instances.

ValueError

If the content of any document is None.

Source code in dynamiq/components/splitters/document.py
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
def run(self, documents: list[Document]) -> dict:
    """
    Splits the provided documents into smaller parts based on the specified configuration.

    The input documents are not modified. Every produced Document receives its own
    copy of the source document's metadata, extended with a 'source_id' entry that
    holds the id of the document it was cut from.

    Args:
        documents (list[Document]): The list of documents to be split.

    Returns:
        dict: A dictionary containing one key, 'documents', which is a list of the split Documents.

    Raises:
        TypeError: If the input is not a list, or its first element is not a Document.
        ValueError: If the content of any document is None.
    """
    if not isinstance(documents, list) or (
        documents and not isinstance(documents[0], Document)
    ):
        raise TypeError("DocumentSplitter expects a List of Documents as input.")

    split_docs = []
    for doc in documents:
        if doc.content is None:
            raise ValueError(
                f"DocumentSplitter only works with text documents but document.content for document "
                f"ID {doc.id} is None."
            )
        units = self._split_into_units(doc.content, self.split_by)
        text_splits = self._concatenate_units(
            units, self.split_length, self.split_overlap
        )
        # Build the metadata from a copy so the caller's document is left untouched,
        # including when its metadata is None.
        base_metadata = deepcopy(doc.metadata) if doc.metadata is not None else {}
        base_metadata["source_id"] = doc.id
        # Give each split an independent copy; otherwise editing the metadata of
        # one split could silently change the metadata of its siblings.
        split_docs.extend(
            Document(content=txt, metadata=deepcopy(base_metadata))
            for txt in text_splits
        )
    return {"documents": split_docs}