Em4e committed on
Commit
259dab0
·
verified ·
1 Parent(s): 98ddca0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -10
app.py CHANGED
@@ -5,7 +5,7 @@ import re
5
  from llama_index.core.node_parser import MarkdownNodeParser
6
  from llama_index.core.schema import Document, MetadataMode
7
  import textstat
8
- from markitdown import Markitdown # <-- MODIFIED: Import Markitdown
9
 
10
  # --- Core Logic Classes ---
11
 
@@ -15,14 +15,15 @@ class WebpageContentProcessor:
15
  This class is responsible for the entire content processing pipeline.
16
  """
17
  def __init__(self):
18
- # --- MODIFIED: Instantiate Markitdown converter ---
19
- self.markdown_converter = Markitdown()
20
- # -------------------------------------------------
 
21
 
22
  def fetch_and_convert_to_markdown(self, url: str) -> str:
23
  """
24
  Fetches HTML content, removes common boilerplate tags from the entire page,
25
- and then converts the remaining body content to Markdown using Markitdown.
26
  """
27
  try:
28
  headers = {
@@ -44,10 +45,11 @@ class WebpageContentProcessor:
44
  if not content_container:
45
  return "Error: Could not find the <body> of the webpage."
46
 
47
- # --- MODIFIED: Use Markitdown for conversion ---
48
- # The .convert() method returns an object; the HTML is in the .text attribute
49
- conversion_result = self.markdown_converter.convert(str(content_container))
50
- markdown_output = conversion_result.text
 
51
  # -----------------------------------------------
52
 
53
  # Post-processing to clean up the resulting Markdown
@@ -324,4 +326,4 @@ with tab2:
324
  st.rerun()
325
 
326
  st.subheader("Final Compiled Document")
327
- st.text_area("Final Markdown Output", manager.get_final_markdown(), height=500, key="final_markdown")
 
5
  from llama_index.core.node_parser import MarkdownNodeParser
6
  from llama_index.core.schema import Document, MetadataMode
7
  import textstat
8
+ from markitdown import MarkItDown # <-- MODIFIED: Corrected class name casing
9
 
10
  # --- Core Logic Classes ---
11
 
 
15
  This class is responsible for the entire content processing pipeline.
16
  """
17
  def __init__(self):
18
+ # --- MODIFIED: Removed the converter instantiation from init ---
19
+ # The MarkItDown library is instantiated per-conversion.
20
+ pass
21
+ # -----------------------------------------------------------
22
 
23
  def fetch_and_convert_to_markdown(self, url: str) -> str:
24
  """
25
  Fetches HTML content, removes common boilerplate tags from the entire page,
26
+ and then converts the remaining body content to Markdown using MarkItDown.
27
  """
28
  try:
29
  headers = {
 
45
  if not content_container:
46
  return "Error: Could not find the <body> of the webpage."
47
 
48
+ # --- MODIFIED: Corrected MarkItDown usage ---
49
+ # Instantiate the converter directly with the HTML content.
50
+ # The result object's 'text' attribute holds the markdown.
51
+ markdown_converter_instance = MarkItDown(str(content_container))
52
+ markdown_output = markdown_converter_instance.text
53
  # -----------------------------------------------
54
 
55
  # Post-processing to clean up the resulting Markdown
 
326
  st.rerun()
327
 
328
  st.subheader("Final Compiled Document")
329
+ st.text_area("Final Markdown Output", manager.get_final_markdown(), height=500, key="final_markdown")