Spaces:

fzanartu
/

fhoc

Runtime error

App Files Files Community

Francisco Zanartu committed on Feb 17

Commit

462129a

1 Parent(s): 81b8253

feat: add parser utilities for cleaning Markdown and encoding PDFs to base64

Browse files

Files changed (2) hide show

src/utils/parser.py +0 -54
src/utils/parser_utils.py +20 -0

src/utils/parser.py DELETED Viewed

@@ -1,54 +0,0 @@
-import os
-from pathlib import Path
-from dotenv import load_dotenv
-from google import genai
-from google.genai import types
-# Pull variables from a local .env file into the process environment.
-load_dotenv()
-# Read the API key once and reuse it for the client below.
-key = os.getenv("GEMINI_API_KEY")
-client = genai.Client(api_key=key)
-# Input PDFs are read from RAW_DIR; converted Markdown is written to MD_DIR.
-RAW_DIR = Path("data/raw")
-MD_DIR = Path("data/md")
-# Create the output directory at import time so later writes cannot fail on it.
-MD_DIR.mkdir(parents=True, exist_ok=True)
-# Instruction text sent alongside each uploaded PDF.
-SYSTEM_PROMPT = """Provide a verbatim transcription of this document into Markdown.
-- Preserve all content exactly (no summarizing or rewriting)
-- Keep the original structure using Markdown headers
-- Preserve lists, emphasis, links, tables, code blocks
-- Output Markdown only
-"""
-class MarkdownConverter:
-    """Convert each PDF in ``RAW_DIR`` into a Markdown file in ``MD_DIR``."""
-    def __init__(self, model="gemini-2.5-pro"):
-        """Store the model name passed to ``generate_content``."""
-        self.model = model
-    def run(self):
-        """Upload every ``*.pdf`` in ``RAW_DIR`` and save the model's transcription.
-
-        PDFs whose ``<stem>.md`` already exists in ``MD_DIR`` are skipped, so
-        the method can be re-run to pick up only the missing conversions. The
-        uploaded remote file is deleted after each request, whether or not
-        the request succeeded.
-        """
-        for i, pdf in enumerate(RAW_DIR.glob("*.pdf")):
-            out = MD_DIR / f"{pdf.stem}.md"
-            if out.exists():
-                # Converted on an earlier run; do not spend another request.
-                continue
-            uploaded = client.files.upload(file=pdf)
-            try:
-                response = client.models.generate_content(
-                    model=self.model,
-                    contents=[SYSTEM_PROMPT, uploaded],
-                    config=types.GenerateContentConfig(
-                        temperature=0,
-                        max_output_tokens=65536,
-                    ),
-                )
-                markdown = response.text
-                if markdown is None:
-                    # NOTE(review): the SDK's ``text`` accessor can be None when
-                    # the response carries no text part. Passing None to
-                    # write_text raises TypeError and would abort the whole
-                    # batch, so report and move on; no output file is created,
-                    # which lets a later run retry this PDF.
-                    print(f"{i}. {pdf.stem} skipped (no text returned)")
-                    continue
-                out.write_text(markdown, encoding="utf-8")
-                print(f"{i}. {pdf.stem} converted")
-            finally:
-                # Always remove the uploaded copy, even on failure or skip.
-                client.files.delete(name=uploaded.name)
-# Script entry point: convert all pending PDFs with the default model.
-if __name__ == "__main__":
-    MarkdownConverter().run()

src/utils/parser_utils.py ADDED Viewed

	@@ -0,0 +1,20 @@

+import re
+import base64
+def clean_markdown(text):
+    """
+    Remove the ```markdown / ``` fence that LLMs often wrap around Markdown output.
+
+    Args:
+        text: Raw model output, possibly wrapped in one outer code fence.
+
+    Returns:
+        The text without the outer opening and closing fence and without
+        leading or trailing whitespace. Fences inside the document are kept
+        because both patterns only match at the very start or end.
+    """
+    # Trim first: both patterns are anchored to the absolute start/end of the
+    # string, so a stray leading newline or trailing space would otherwise
+    # prevent the fence from matching.
+    text = text.strip()
+    # Remove leading ```markdown or ``` (tag matched case-insensitively)
+    text = re.sub(r"^```(?:markdown)?\n?", "", text, flags=re.IGNORECASE)
+    # Remove trailing ```
+    text = re.sub(r"\n?```$", "", text)
+    return text.strip()
+def encode_pdf_to_base64(file_path):
+    """Read a local file in binary mode and return its contents as base64 text.
+
+    Args:
+        file_path: Path of the file to read; passed straight to ``open``.
+
+    Returns:
+        The base64 encoding of the file's bytes as a ``str``.
+    """
+    with open(file_path, "rb") as pdf_file:
+        raw_bytes = pdf_file.read()
+    # b64encode returns bytes; decode to get a plain string.
+    return base64.b64encode(raw_bytes).decode("utf-8")