Skip to content

Auto

AutoSplitter

Bases: BaseSplitterNode

Routes each Document to the most suitable splitter based on metadata and content.

Source code in dynamiq/nodes/splitters/auto.py
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
class AutoSplitter(BaseSplitterNode):
    """Routes each Document to the most suitable splitter based on metadata and content."""

    # Component class this node is bound to; the options declared below are
    # handed to it through `_component_kwargs`.
    component_cls: ClassVar[type] = AutoSplitterComponent

    name: str = "AutoSplitter"
    description: str = "Routes documents to structure-aware splitters and falls back to recursive splitting."

    # --- Routing behaviour -------------------------------------------------
    rules: list[AutoSplitterRule] = Field(
        description="Serializable rules used to select splitter strategies.",
        default_factory=_default_rules,
    )
    fallback_strategy: AutoSplitterStrategy = Field(
        description="Strategy used when no rule or inference matches.",
        default=AutoSplitterStrategy.RECURSIVE_CHARACTER,
    )
    fallback_on_error: bool = Field(
        default=True,
        description="Fallback to fallback_strategy on inferred route errors.",
    )
    infer_from_content: bool = Field(
        default=True,
        description="Infer strategy from lightweight content sniffing.",
    )
    add_splitter_metadata: bool = Field(
        default=True,
        description="Stamp selected splitter strategy into metadata.",
    )
    splitter_metadata_key: str = Field(
        default="splitter_strategy",
        description="Metadata key for selected strategy.",
    )

    # --- JSON splitting options --------------------------------------------
    json_max_chunk_size: int = Field(
        default=2000,
        gt=0,
        description="Maximum serialized JSON chunk size.",
    )
    json_min_chunk_size: int | None = Field(
        default=None,
        description="Minimum serialized JSON chunk size.",
    )
    json_convert_lists: bool = Field(
        default=False,
        description="Convert JSON lists to indexed dicts before splitting.",
    )

    # --- Markdown splitting options ----------------------------------------
    markdown_headers_to_split_on: list[tuple[str, str]] | None = Field(
        description="Pairs of (markdown-prefix, metadata-key) for MarkdownHeaderSplitter.",
        default=None,
    )
    markdown_strip_headers: bool = Field(
        default=True,
        description="Drop Markdown header lines from chunks.",
    )
    markdown_return_each_line: bool = Field(
        default=False,
        description="Emit one Markdown chunk per non-empty line.",
    )

    # --- HTML splitting options --------------------------------------------
    html_headers_to_split_on: list[tuple[str, str]] | None = Field(
        description="Pairs of (html-tag, metadata-key) for HTML splitters.",
        default=None,
    )
    html_return_each_element: bool = Field(
        default=False,
        description="Emit one HTML chunk per element.",
    )
    html_xpath_filter: str | None = Field(
        description="Optional XPath used to scope HTML section splitting.",
        default=None,
    )

    # --- Code splitting options --------------------------------------------
    code_parser: CodeParser = Field(
        default=CodeParser.REGEX,
        description="Parser to use for code splitting.",
    )
    code_default_language: Language = Field(
        description="Fallback language for code splitting.",
        default=Language.PYTHON,
    )

    def _component_kwargs(self) -> dict[str, Any]:
        """Extend the parent's component kwargs with this node's options.

        Starts from the dict returned by ``super()._component_kwargs()`` and
        adds one entry per option field declared on this class, keyed by the
        field's own name.

        Returns:
            The parent's kwargs dict, updated in place with the routing, JSON,
            Markdown, HTML and code options of this node.
        """
        # Each name is both the attribute read from `self` and the keyword
        # under which its value is stored.
        forwarded_fields = (
            "rules",
            "fallback_strategy",
            "fallback_on_error",
            "infer_from_content",
            "add_splitter_metadata",
            "splitter_metadata_key",
            "json_max_chunk_size",
            "json_min_chunk_size",
            "json_convert_lists",
            "markdown_headers_to_split_on",
            "markdown_strip_headers",
            "markdown_return_each_line",
            "html_headers_to_split_on",
            "html_return_each_element",
            "html_xpath_filter",
            "code_parser",
            "code_default_language",
        )

        component_kwargs = super()._component_kwargs()
        # Collect all values first, then merge them in a single update.
        component_kwargs.update({field_name: getattr(self, field_name) for field_name in forwarded_fields})
        return component_kwargs