Aadityaramrame committed on
Commit
8d59e0f
·
verified ·
1 Parent(s): ac40586

Create pipeline.py

Browse files
Files changed (1) hide show
  1. pipeline.py +41 -0
pipeline.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytesseract
2
+ from pdf2image import convert_from_path
3
+ import google.generativeai as genai
4
+ import os, json
5
+
6
# --- Configure Gemini ---
# The API key is read from the environment once, when this module is
# imported. Importing fails fast with a ValueError if the key is absent or
# empty, so a misconfigured deployment is noticed before any request is made.
api_key = os.environ.get("GEMINI_API_KEY")
if api_key is None or api_key == "":
    raise ValueError(
        "❌ GEMINI_API_KEY not found. Please set it in Hugging Face Space Secrets."
    )
genai.configure(api_key=api_key)
11
+
12
def extract_text_from_pdf(pdf_path):
    """Run OCR over every page of a PDF and return the combined text.

    Args:
        pdf_path: Path of the PDF file, passed straight to
            ``convert_from_path``.

    Returns:
        The OCR output of all pages, one page after another separated by a
        newline, with leading and trailing whitespace removed.
    """
    page_images = convert_from_path(pdf_path)
    # One OCR call per rendered page, in page order.
    page_texts = (pytesseract.image_to_string(image) for image in page_images)
    return "\n".join(page_texts).strip()
18
+
19
def _parse_json_object(text):
    """Return the JSON object contained in a model reply, or None.

    Several readings of the reply are tried in order, and the first one that
    decodes to a ``dict`` wins:

    1. the reply exactly as given;
    2. the span from the first ``{`` to the last ``}``, which copes with
       Markdown code fences and with prose placed around the JSON;
    3. the reply with every fence marker removed (the legacy cleanup, kept
       as a last resort so nothing that used to parse stops parsing).
    """
    candidates = [text]
    start, end = text.find("{"), text.rfind("}")
    if 0 <= start < end:
        candidates.append(text[start:end + 1])
    candidates.append(text.replace("```json", "").replace("```", "").strip())

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except (ValueError, RecursionError):
            # json.JSONDecodeError is a ValueError; RecursionError covers
            # pathologically nested input.
            continue
        # A bare list, string, or number is valid JSON but is not the
        # key-value mapping the caller needs.
        if isinstance(parsed, dict):
            return parsed
    return None


def extract_key_values_with_gemini(raw_text, fields):
    """Ask Gemini to pull the requested fields out of a document's text.

    Args:
        raw_text: Text of the document (for example OCR output).
        fields: Iterable of field names to extract. It is interpolated into
            the prompt and used as the key set of the result.

    Returns:
        A dict with exactly one entry per name in ``fields``. A field that
        the model did not return, or any field when the reply could not be
        parsed as a JSON object, maps to ``""``. When parsing fails the
        cleaned reply is available under the ``"raw_output"`` key, but only
        if ``"raw_output"`` is itself one of the requested fields.

    Raises:
        Whatever ``genai`` raises for a failed request.
        NOTE(review): ``response.text`` may also raise when the reply has no
        text part (e.g. a blocked response) -- confirm against the SDK docs.
    """
    # Nothing was requested, so the result is empty regardless of what the
    # model would say; skip the network round trip.
    if not fields:
        return {}

    prompt = f"""
    You are an intelligent document parser.
    Given the following document text, extract only these fields: {fields}.
    Return strictly as JSON key-value pairs.
    Document text:
    {raw_text}
    """
    model = genai.GenerativeModel("models/gemini-2.5-flash")
    response = model.generate_content(prompt)
    text = response.text.strip()

    extracted = _parse_json_object(text)
    if extracted is None:
        # --- Cleanup ---
        # Keep the fallback payload identical to the historical one: the
        # reply with code-fence markers stripped.
        cleaned = text.replace("```json", "").replace("```", "").strip()
        extracted = {"raw_output": cleaned}

    # --- Ensure all fields exist ---
    return {field: extracted.get(field, "") for field in fields}