Spaces:
Sleeping
Sleeping
| """ | |
| Intelligent text cleaner using Gemini to format raw OCR and PDF extractions perfectly. | |
| """ | |
| import time | |
| import config | |
| try: | |
| import google.generativeai as genai | |
| GEMINI_AVAILABLE = True | |
| except ImportError: | |
| GEMINI_AVAILABLE = False | |
def clean_format_text(raw_text: str) -> str:
    """Clean up raw extracted text with Gemini, falling back to the input on any problem.

    The text is appended to a fixed instruction prompt and sent to the model
    named by ``config.GEMINI_MODEL_NAME``. The instructions ask the model to
    keep every word, group the content under bold markdown topic headers, and
    stitch broken line breaks back together.

    Args:
        raw_text: Text as extracted from the document (OCR or PDF).

    Returns:
        The model's reply with surrounding whitespace stripped. ``raw_text`` is
        returned unchanged when it is not a string, when it has fewer than 50
        characters after stripping, when Gemini is not available, when the
        request raises, or when the reply is empty.
    """
    # Check the input first: it is the cheapest test, and it keeps a None or
    # other non-string value from raising on .strip() outside the try block.
    if not isinstance(raw_text, str) or len(raw_text.strip()) < 50:
        return raw_text

    # Both the project configuration and the optional SDK import must be usable.
    if not config.is_gemini_available() or not GEMINI_AVAILABLE:
        return raw_text

    # Building the prompt cannot fail, so it stays outside the guarded region.
    prompt = (
        "You are a master document formatting assistant. Your task is to clean up and perfectly format the raw extracted text below into a structured and topic-wise format.\n\n"
        "CRITICAL INSTRUCTIONS:\n"
        "1. You MUST preserve EVERY SINGLE WORD and detail from the original text. Do not summarize, skip, or rephrase anything. No information loss is acceptable.\n"
        "2. Organize all content logically into structured, thematic topics (topic-wise). Apply bold markdown headers (e.g. **Contact Information**, **Experience**, **Summary**, or other relevant topics) and use proper bullet points.\n"
        "3. Fix arbitrary broken line-breaks (typical OCR artifacts) and stitch sentences back together naturally.\n"
        "4. Return ONLY the perfectly formatted text. Do not include any JSON wrapping or conversational preamble.\n\n"
        "RAW TEXT:\n"
    )

    try:
        genai.configure(api_key=config.GEMINI_API_KEY)
        model = genai.GenerativeModel(config.GEMINI_MODEL_NAME)
        # Plain text is requested (no JSON response mode); the raw text is
        # concatenated directly after the instructions.
        # NOTE(review): raw_text is document content and is not delimited from
        # the instructions - confirm that is acceptable for untrusted documents.
        response = model.generate_content(prompt + raw_text)
        # Read the reply once; an empty or whitespace-only reply falls through
        # to the raw-text fallback below.
        formatted = (response.text or "").strip()
        if formatted:
            return formatted
    except Exception as e:
        # Deliberate best-effort: any failure in the SDK calls above degrades
        # to returning the unformatted text instead of propagating.
        print(f"Intelligent formatting failed, falling back to raw: {e}")
    return raw_text