html_utils

`toolkitx.html_utils`

Functions

`html_to_markdown(html, handle_nested_tables='json', use_first_row_as_header=True, **kwargs)`

Converts HTML to Markdown with robust table support.

Nested tables are converted to JSON strings to maintain structure in Markdown cells. Merged cells (colspan/rowspan) are expanded. If no header is found, the first row is used as a header by default.

Parameters:

Name	Type	Description	Default
`html`	`str`	Input HTML string.	required
`handle_nested_tables`	`str`	How to handle nested tables ('json' or 'unwrap').	`'json'`
`use_first_row_as_header`	`bool`	If True and no `<th>`/`<thead>` is found, use the first row as header.	`True`
`**kwargs`		Additional arguments for markdownify.	`{}`

Examples:

Standard table conversion (first row promoted to header by default)

>>> html = "<table><tr><td>Cell 1</td><td>Cell 2</td></tr><tr><td>Data</td><td>Val</td></tr></table>"
>>> print(html_to_markdown(html))
| Cell 1 | Cell 2 |
| --- | --- |
| Data | Val |

Disable automatic header promotion

>>> print(html_to_markdown(html, use_first_row_as_header=False))
|  |  |
| --- | --- |
| Cell 1 | Cell 2 |
| Data | Val |

Colspan expansion

>>> html_colspan = "<table><tr><td colspan='2'>Merged</td><td>Normal</td></tr></table>"
>>> print(html_to_markdown(html_colspan))
| Merged | Merged | Normal |
| --- | --- | --- |

Rowspan expansion

>>> html_rowspan = "<table><tr><td rowspan='2'>Rows</td><td>A</td></tr><tr><td>B</td></tr></table>"
>>> print(html_to_markdown(html_rowspan))
| Rows | A |
| --- | --- |
| Rows | B |

Source code in toolkitx/html_utils.py

def html_to_markdown(
    html: str,
    handle_nested_tables: str = "json",
    use_first_row_as_header: bool = True,
    **kwargs
) -> str:
    """
    Converts HTML to Markdown with robust table support.

    Tables are handled from the innermost outwards. A table nested inside
    another table is either serialised to a JSON string (a list of rows, each
    a list of Markdown cell strings) placed in a <code> element, or unwrapped
    so that only its contents remain in the parent cell. Top-level tables have
    their merged cells (colspan/rowspan) expanded and, when they contain no
    header, get their first row promoted to a header row by default.

    Args:
        html: Input HTML string.
        handle_nested_tables: How to handle nested tables ('json' or 'unwrap').
            Any value other than 'json' takes the unwrap path.
        use_first_row_as_header: If True and no <th>/<thead> is found, use the first row as header.
        **kwargs: Additional arguments for markdownify. For the final
            conversion, ``keep_inline_images_in`` defaults to ``["td", "th"]``
            unless the caller supplies it.

    Returns:
        The Markdown text, stripped of leading and trailing whitespace.

    Examples:
        # Standard table conversion (first row promoted to header by default)
        >>> html = "<table><tr><td>Cell 1</td><td>Cell 2</td></tr><tr><td>Data</td><td>Val</td></tr></table>"
        >>> print(html_to_markdown(html))
        | Cell 1 | Cell 2 |
        | --- | --- |
        | Data | Val |

        # Disable automatic header promotion
        >>> print(html_to_markdown(html, use_first_row_as_header=False))
        |  |  |
        | --- | --- |
        | Cell 1 | Cell 2 |
        | Data | Val |

        # Colspan expansion
        >>> html_colspan = "<table><tr><td colspan='2'>Merged</td><td>Normal</td></tr></table>"
        >>> print(html_to_markdown(html_colspan))
        | Merged | Merged | Normal |
        | --- | --- | --- |

        # Rowspan expansion
        >>> html_rowspan = "<table><tr><td rowspan='2'>Rows</td><td>A</td></tr><tr><td>B</td></tr></table>"
        >>> print(html_to_markdown(html_rowspan))
        | Rows | A |
        | --- | --- |
        | Rows | B |
    """
    soup = BeautifulSoup(html, "html.parser")

    def cell_to_markdown(cell):
        # Render only the cell's children, not the <td>/<th> wrapper itself.
        inner_html = "".join(map(str, cell.contents)).strip()
        text = md(inner_html, **kwargs).strip()
        if '"' in text:
            # Rewrite double-quoted link/image titles with single quotes
            # (presumably to limit escaped quotes in the JSON output).
            for pattern in (
                r'(\[[^\]]*\]\([^)]*)\"([^)]*)\"',
                r'(\!\[[^\]]*\]\([^)]*)\"([^)]*)\"',
            ):
                text = re.sub(pattern, r"\1'\2'", text)
        return text

    # Keep going until no element named "table" is left in the tree; finished
    # top-level tables are renamed so they drop out of this search.
    remaining = soup.find_all("table")
    while remaining:
        # Pick the first table that has no table inside it; fall back to the
        # first table found if none qualifies.
        current = next(
            (candidate for candidate in remaining if not candidate.find("table")),
            remaining[0],
        )

        if not current.find_parent("table"):
            # Top-level table: expand merged cells in place.
            _expand_table_cells(current)

            # Without any <th>/<thead>, turn the first row's cells into headers.
            if use_first_row_as_header and not current.find(["th", "thead"]):
                table_rows = _get_table_rows(current)
                if table_rows:
                    for cell in table_rows[0].find_all("td", recursive=False):
                        cell.name = "th"

            # Temporary rename so this table is not picked up again.
            current.name = "processed_table"
        elif handle_nested_tables == "json":
            # Nested table: expand it, then replace it with its JSON form.
            _expand_table_cells(current)
            matrix = [
                [
                    cell_to_markdown(cell)
                    for cell in row.find_all(["td", "th"], recursive=False)
                ]
                for row in _get_table_rows(current)
            ]
            replacement = soup.new_tag("code")
            replacement.string = json.dumps(matrix, ensure_ascii=False)
            current.replace_with(replacement)
        else:
            # Nested table, non-JSON mode: drop the <table> tag, keep its children.
            current.unwrap()

        remaining = soup.find_all("table")

    # Give the finished top-level tables their real tag name back.
    for finished in soup.find_all("processed_table"):
        finished.name = "table"

    # Ensure images are kept in table cells unless the caller decided otherwise.
    kwargs.setdefault("keep_inline_images_in", ["td", "th"])

    return md(str(soup), **kwargs).strip()