Francisco Zanartu committed on
Commit
462129a
·
1 Parent(s): 81b8253

feat: add parser utilities for cleaning Markdown and encoding PDFs to base64

Browse files
Files changed (2) hide show
  1. src/utils/parser.py +0 -54
  2. src/utils/parser_utils.py +20 -0
src/utils/parser.py DELETED
@@ -1,54 +0,0 @@
1
- import os
2
- from pathlib import Path
3
- from dotenv import load_dotenv
4
- from google import genai
5
- from google.genai import types
6
-
7
# Load environment variables via python-dotenv before reading the API key.
load_dotenv()
key = os.getenv("GEMINI_API_KEY")

# Reuse the key read above rather than querying the environment a second time.
client = genai.Client(api_key=key)

# Source PDFs are read from RAW_DIR; generated Markdown is written to MD_DIR.
RAW_DIR = Path("data/raw")
MD_DIR = Path("data/md")
# Create the output directory at import time so later writes cannot fail on it.
MD_DIR.mkdir(parents=True, exist_ok=True)

# Instruction text sent alongside each uploaded document.
SYSTEM_PROMPT = """Provide a verbatim transcription of this document into Markdown.
- Preserve all content exactly (no summarizing or rewriting)
- Keep the original structure using Markdown headers
- Preserve lists, emphasis, links, tables, code blocks
- Output Markdown only
"""
22
-
23
-
24
class MarkdownConverter:
    """Convert every PDF in RAW_DIR into a Markdown file in MD_DIR."""

    def __init__(self, model="gemini-2.5-pro"):
        """Store the model name used for every conversion request.

        Args:
            model: Model identifier passed to ``generate_content``.
        """
        self.model = model

    def run(self):
        """Transcribe each not-yet-converted PDF and write the result to disk.

        For every ``*.pdf`` in RAW_DIR without a matching ``.md`` in MD_DIR,
        the file is uploaded, the model is asked to transcribe it using
        SYSTEM_PROMPT, and the response text is written as UTF-8. The
        uploaded file is deleted afterwards, whether or not the request
        succeeded.

        A response with no text is skipped instead of written, so the PDF
        has no output file and is picked up again on the next run.
        """
        for i, pdf in enumerate(RAW_DIR.glob("*.pdf")):
            out = MD_DIR / f"{pdf.stem}.md"
            # Existing output means this PDF was handled by an earlier run.
            if out.exists():
                continue

            uploaded = client.files.upload(file=pdf)

            try:
                response = client.models.generate_content(
                    model=self.model,
                    contents=[SYSTEM_PROMPT, uploaded],
                    config=types.GenerateContentConfig(
                        temperature=0,
                        max_output_tokens=65536,
                    ),
                )

                text = response.text
                # write_text(None) would raise TypeError, and an empty file
                # would make out.exists() skip this PDF on every later run.
                if not text:
                    print(f"{i}. {pdf.stem} skipped: model returned no text")
                    continue

                out.write_text(text, encoding="utf-8")
                print(f"{i}. {pdf.stem} converted")

            finally:
                # Always remove the uploaded copy, including on errors and skips.
                client.files.delete(name=uploaded.name)
51
-
52
-
53
if __name__ == "__main__":
    # Script entry point: run a conversion pass with the default model.
    converter = MarkdownConverter()
    converter.run()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/utils/parser_utils.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import base64
3
+
4
+
5
def clean_markdown(text):
    """Strip the code-fence wrapper that LLMs often put around Markdown output.

    Surrounding whitespace is trimmed before matching, because the fence
    patterns are anchored to the very start and end of the string and would
    otherwise miss a fence preceded or followed by blank lines or spaces.

    Args:
        text: Model output, possibly wrapped in a leading ```markdown (or
            bare ```) fence and a trailing ``` fence.

    Returns:
        The text with a leading opening fence and a trailing closing fence
        removed (each independently, if present) and with surrounding
        whitespace stripped. Fences inside the text are left untouched.
    """
    # Trim first so the ^ and $ anchors below line up with the fences.
    text = text.strip()
    # Remove leading ```markdown or ``` (the tag is matched case-insensitively).
    text = re.sub(r"^```(?:markdown)?\n?", "", text, flags=re.IGNORECASE)
    # Remove trailing ```
    text = re.sub(r"\n?```$", "", text)
    return text.strip()
14
+
15
+
16
def encode_pdf_to_base64(file_path):
    """Read the file at ``file_path`` and return its bytes as a base64 string."""
    with open(file_path, "rb") as pdf_file:
        raw_bytes = pdf_file.read()
    # b64encode yields bytes; decode them to get a plain str.
    return base64.b64encode(raw_bytes).decode("utf-8")