Spaces:
Sleeping
Sleeping
| import pytesseract | |
| from pdf2image import convert_from_path | |
| import google.generativeai as genai | |
| import os, json | |
# --- Configure Gemini ---
# The API key is taken from the environment. A missing or empty value aborts
# module import with a ValueError, so genai is never configured without a key.
api_key = os.environ.get("GEMINI_API_KEY")
if api_key:
    genai.configure(api_key=api_key)
else:
    raise ValueError("❌ GEMINI_API_KEY not found. Please set it in Hugging Face Space Secrets.")
def extract_text_from_pdf(pdf_path) -> str:
    """Run OCR over every page of a PDF and return the combined text.

    The PDF is converted to page images with ``convert_from_path`` and each
    image is passed to ``pytesseract.image_to_string``.

    Args:
        pdf_path: Location of the PDF; forwarded unchanged to
            ``convert_from_path``.

    Returns:
        The OCR output of all pages, each page followed by a newline, with
        leading and trailing whitespace stripped from the combined string.
        An empty string if no pages are produced.
    """
    pages = convert_from_path(pdf_path)
    # Build the result with a single join instead of growing a string with
    # ``+=`` once per page.
    return "".join(
        pytesseract.image_to_string(page) + "\n" for page in pages
    ).strip()
def extract_key_values_with_gemini(raw_text, fields) -> dict:
    """Ask Gemini to pull the requested fields out of a document's text.

    A prompt containing ``fields`` and ``raw_text`` is sent to the
    ``models/gemini-2.5-flash`` model. Markdown code fences are stripped from
    the reply, which is then parsed as JSON.

    Args:
        raw_text: Document text to embed in the prompt.
        fields: Iterable of field names to extract (presumably strings;
            each one becomes a key of the returned dict).

    Returns:
        A dict with exactly one entry per item in ``fields``. A field that is
        absent from the model's JSON maps to ``""``. If the reply is not a
        JSON object, the cleaned reply is kept under ``"raw_output"`` and is
        only visible in the result when ``"raw_output"`` is itself one of the
        requested fields.

    Raises:
        Whatever ``genai`` raises while generating content or reading
        ``response.text``; those errors are not handled here.
    """
    # Assembled from adjacent literals so the prompt text carries no
    # indentation from this function body.
    prompt = (
        "\n"
        "You are an intelligent document parser.\n"
        f"Given the following document text, extract only these fields: {fields}.\n"
        "Return strictly as JSON key-value pairs.\n"
        "Document text:\n"
        f"{raw_text}\n"
    )

    model = genai.GenerativeModel("models/gemini-2.5-flash")
    response = model.generate_content(prompt)

    # --- Cleanup ---
    # The model may wrap its answer in a Markdown code fence; drop the fence
    # markers before parsing.
    cleaned = response.text.strip()
    cleaned = cleaned.replace("```json", "").replace("```", "").strip()

    try:
        parsed = json.loads(cleaned)
    except (ValueError, RecursionError):
        # ValueError covers json.JSONDecodeError; RecursionError covers
        # pathologically nested replies. Both are treated as "not JSON".
        parsed = None

    # Valid JSON that is not an object (list, string, number, ...) has no
    # ``.get``; treat it like unparseable output instead of crashing.
    if not isinstance(parsed, dict):
        parsed = {"raw_output": cleaned}

    # --- Ensure all fields exist ---
    return {field: parsed.get(field, "") for field in fields}