Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -5,7 +5,7 @@ import re
|
|
| 5 |
from llama_index.core.node_parser import MarkdownNodeParser
|
| 6 |
from llama_index.core.schema import Document, MetadataMode
|
| 7 |
import textstat
|
| 8 |
-
from markitdown import
|
| 9 |
|
| 10 |
# --- Core Logic Classes ---
|
| 11 |
|
|
@@ -15,14 +15,15 @@ class WebpageContentProcessor:
|
|
| 15 |
This class is responsible for the entire content processing pipeline.
|
| 16 |
"""
|
| 17 |
def __init__(self):
|
| 18 |
-
# --- MODIFIED:
|
| 19 |
-
|
| 20 |
-
|
|
|
|
| 21 |
|
| 22 |
def fetch_and_convert_to_markdown(self, url: str) -> str:
|
| 23 |
"""
|
| 24 |
Fetches HTML content, removes common boilerplate tags from the entire page,
|
| 25 |
-
and then converts the remaining body content to Markdown using
|
| 26 |
"""
|
| 27 |
try:
|
| 28 |
headers = {
|
|
@@ -44,10 +45,11 @@ class WebpageContentProcessor:
|
|
| 44 |
if not content_container:
|
| 45 |
return "Error: Could not find the <body> of the webpage."
|
| 46 |
|
| 47 |
-
# --- MODIFIED:
|
| 48 |
-
#
|
| 49 |
-
|
| 50 |
-
|
|
|
|
| 51 |
# -----------------------------------------------
|
| 52 |
|
| 53 |
# Post-processing to clean up the resulting Markdown
|
|
@@ -324,4 +326,4 @@ with tab2:
|
|
| 324 |
st.rerun()
|
| 325 |
|
| 326 |
st.subheader("Final Compiled Document")
|
| 327 |
-
st.text_area("Final Markdown Output", manager.get_final_markdown(), height=500, key="final_markdown")
|
|
|
|
| 5 |
from llama_index.core.node_parser import MarkdownNodeParser
|
| 6 |
from llama_index.core.schema import Document, MetadataMode
|
| 7 |
import textstat
|
| 8 |
+
from markitdown import MarkItDown # <-- MODIFIED: Corrected class name casing
|
| 9 |
|
| 10 |
# --- Core Logic Classes ---
|
| 11 |
|
|
|
|
| 15 |
This class is responsible for the entire content processing pipeline.
|
| 16 |
"""
|
| 17 |
def __init__(self):
|
| 18 |
+
# --- MODIFIED: Removed the converter instantiation from init ---
|
| 19 |
+
# The MarkItDown library is instantiated per-conversion.
|
| 20 |
+
pass
|
| 21 |
+
# -----------------------------------------------------------
|
| 22 |
|
| 23 |
def fetch_and_convert_to_markdown(self, url: str) -> str:
|
| 24 |
"""
|
| 25 |
Fetches HTML content, removes common boilerplate tags from the entire page,
|
| 26 |
+
and then converts the remaining body content to Markdown using MarkItDown.
|
| 27 |
"""
|
| 28 |
try:
|
| 29 |
headers = {
|
|
|
|
| 45 |
if not content_container:
|
| 46 |
return "Error: Could not find the <body> of the webpage."
|
| 47 |
|
| 48 |
+
# --- MODIFIED: Corrected MarkItDown usage ---
|
| 49 |
+
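# Convert the cleaned HTML to Markdown. NOTE(review): MarkItDown's documented usage is MarkItDown() followed by convert(...) / convert_stream(...), not passing the content to the constructor — confirm.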
|
| 50 |
+
# NOTE(review): the documented result attribute that holds the Markdown is 'text_content', not 'text' — confirm.
|
| 51 |
+
markdown_converter_instance = MarkItDown(str(content_container))
|
| 52 |
+
markdown_output = markdown_converter_instance.text
|
| 53 |
# -----------------------------------------------
|
| 54 |
|
| 55 |
# Post-processing to clean up the resulting Markdown
|
|
|
|
| 326 |
st.rerun()
|
| 327 |
|
| 328 |
st.subheader("Final Compiled Document")
|
| 329 |
+
st.text_area("Final Markdown Output", manager.get_final_markdown(), height=500, key="final_markdown")
|