Bases: Node
Splits HTML on header tags (h1..h6) and carries the header path in metadata.
Requires the optional beautifulsoup4 package (lxml recommended).
Source code in dynamiq/nodes/splitters/html.py
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
class HTMLHeaderSplitter(Node):
    """Splits HTML on header tags (``h1``..``h6``) and carries the header path in metadata.

    Requires the optional ``beautifulsoup4`` package (``lxml`` recommended).

    Attributes:
        group: Node group; fixed to ``NodeGroup.SPLITTERS``.
        name: Display name of the node.
        description: Short description of the node.
        headers_to_split_on: Pairs of ``(html-tag, metadata-key)``; defaults to
            ``h1`` through ``h4``, each mapped to a metadata key of the same name.
        return_each_element: When true, emit one chunk per element.
        splitter: The component that performs the split. If left as ``None`` it is
            built in ``init_components`` from the two settings above.
    """

    group: Literal[NodeGroup.SPLITTERS] = NodeGroup.SPLITTERS
    name: str = "HTMLHeaderSplitter"
    description: str = "Splits HTML on header tags and propagates header path to metadata."
    headers_to_split_on: list[tuple[str, str]] = Field(
        default_factory=lambda: [("h1", "h1"), ("h2", "h2"), ("h3", "h3"), ("h4", "h4")],
        description="Pairs of (html-tag, metadata-key).",
    )
    return_each_element: bool = Field(default=False, description="Emit one chunk per element.")
    splitter: Any | None = None
    input_schema: ClassVar[type[HTMLSplitterInputSchema]] = HTMLSplitterInputSchema

    @property
    def to_dict_exclude_params(self) -> dict[str, Any]:
        """Return the parent's exclusion mapping extended with a ``"splitter"`` entry."""
        return super().to_dict_exclude_params | {"splitter": True}

    def init_components(self, connection_manager: ConnectionManager | None = None) -> None:
        """Initialize the node and build the splitter component when none was supplied.

        Args:
            connection_manager: Manager forwarded to the parent initializer. A new
                ``ConnectionManager`` is created when this is ``None`` (or falsy).
        """
        connection_manager = connection_manager or ConnectionManager()
        super().init_components(connection_manager)
        # Keep a splitter that was already provided; only build the default one.
        if self.splitter is None:
            self.splitter = HTMLHeaderSplitterComponent(
                # Normalize each pair to a tuple before handing it to the component.
                headers_to_split_on=[tuple(pair) for pair in self.headers_to_split_on],
                return_each_element=self.return_each_element,
            )

    def execute(
        self,
        input_data: HTMLSplitterInputSchema,
        config: RunnableConfig | None = None,
        **kwargs,
    ) -> dict[str, Any]:
        """Split the input documents with the configured splitter component.

        Args:
            input_data: Validated input; its ``documents`` attribute is split.
            config: Optional run configuration. It is passed through
                ``ensure_config`` before use, so ``None`` is accepted.
            **kwargs: Extra arguments forwarded to ``run_on_node_execute_run``.

        Returns:
            A dict with a single ``"documents"`` key holding the ``"documents"``
            entry of the splitter's result.
        """
        config = ensure_config(config)
        self.run_on_node_execute_run(config.callbacks, **kwargs)
        documents = input_data.documents
        logger.debug(f"HTMLHeaderSplitter: splitting {len(documents)} documents.")
        output = self.splitter.run(documents=documents)
        return {"documents": output["documents"]}
|