Spaces:
Sleeping
Sleeping
| # app.py -- Backend v5 (Gemini -> JSON -> Advanced DOCX) | |
| # Universal Document Enhancer - Works for ANY document type | |
| import os | |
| import io | |
| import json | |
| import traceback | |
| import unicodedata | |
| import re | |
| from fastapi import FastAPI, File, UploadFile, Form | |
| from fastapi.middleware.cors import CORSMiddleware | |
| from fastapi.responses import JSONResponse, StreamingResponse | |
| import fitz # pymupdf | |
| from docx import Document | |
| from docx.shared import Pt, Inches | |
| from docx.enum.text import WD_ALIGN_PARAGRAPH | |
| import requests | |
app = FastAPI(title="Document Enhancer v5")

# Wide-open CORS: the frontend may be served from a different origin than
# this API, so all origins/methods/headers are allowed.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Up to six Gemini API keys may be supplied via environment variables
# (GEMINI_API_KEY plus numbered fallbacks GEMINI_API_KEY1..5).
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
GEMINI_API_KEY1 = os.getenv("GEMINI_API_KEY1")
GEMINI_API_KEY2 = os.getenv("GEMINI_API_KEY2")
GEMINI_API_KEY3 = os.getenv("GEMINI_API_KEY3")
GEMINI_API_KEY4 = os.getenv("GEMINI_API_KEY4")
GEMINI_API_KEY5 = os.getenv("GEMINI_API_KEY5")
# Build list of available API keys (unset/empty ones are dropped).
GEMINI_API_KEYS = [
    key for key in [
        GEMINI_API_KEY,
        GEMINI_API_KEY1,
        GEMINI_API_KEY2,
        GEMINI_API_KEY3,
        GEMINI_API_KEY4,
        GEMINI_API_KEY5
    ] if key
]
# Gemini REST endpoint (model: gemini-2.0-flash-exp); the API key is
# appended as a ?key= query parameter at request time.
GEMINI_URL = "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-exp:generateContent"
| # ------------------------- | |
| # Utility: Sanitize filename for HTTP headers | |
| # ------------------------- | |
def sanitize_filename(filename: str) -> str:
    """
    Sanitize a filename so it is safe for an HTTP Content-Disposition header.

    Accented characters are transliterated to their ASCII base form
    ("é" -> "e") via NFKD normalization; any character with no ASCII
    equivalent (en-dashes, CJK, emoji, ...) is dropped.  The result is
    latin-1 safe and always carries a ".docx" extension.

    Args:
        filename: Original (possibly non-ASCII) filename, extension optional.

    Returns:
        An ASCII-only filename ending in ".docx"; "document.docx" if nothing
        usable remains.
    """
    if not filename:
        return "document.docx"
    # Strip the original extension; ".docx" is re-appended below.
    name_without_ext = filename.rsplit('.', 1)[0] if '.' in filename else filename
    # NFKD splits accented characters into base letter + combining mark,
    # so the subsequent ASCII encode keeps "e" from "é" instead of
    # dropping the whole character (as a plain encode('ascii','ignore')
    # on the composed form would).
    normalized = unicodedata.normalize('NFKD', name_without_ext)
    ascii_name = normalized.encode('ascii', 'ignore').decode('ascii')
    # If nothing is left after ASCII conversion, use the default.
    if not ascii_name.strip():
        ascii_name = "document"
    # Keep only word characters, whitespace, and hyphens ...
    safe_name = re.sub(r'[^\w\s\-]', '', ascii_name)
    # ... then collapse runs of whitespace/hyphens into single underscores.
    safe_name = re.sub(r'[\s\-]+', '_', safe_name)
    # Trim stray underscores and guard against an empty result.
    safe_name = safe_name.strip('_') or "document"
    return f"{safe_name}.docx"
| # ------------------------- | |
| # Utility: Repair truncated JSON | |
| # ------------------------- | |
def repair_truncated_json(json_str: str) -> str:
    """
    Best-effort repair of JSON that was cut off mid-generation.

    Strategy: if the text appears to stop inside or right after a string
    literal, trim back to the last complete object (or drop the dangling
    fragment); then append the ']' / '}' characters needed to balance open
    brackets.  The result is NOT guaranteed to be valid JSON -- callers
    must still parse inside a try/except.

    Args:
        json_str: Possibly truncated JSON text.

    Returns:
        The repaired text, or the input unchanged if repair itself fails.
    """
    try:
        # If JSON is truncated mid-string, remove the incomplete part.
        if json_str.rstrip().endswith('"'):
            # Prefer cutting at the last fully closed object ('"}').
            last_complete = json_str.rfind('"}')
            if last_complete != -1:
                json_str = json_str[:last_complete + 2]
            else:
                last_quote = json_str.rfind('"')
                if last_quote != -1:
                    after_quote = json_str[last_quote + 1:].strip()
                    # A quote not followed by ',' '}' or ']' means the string
                    # never closed -- drop back to the previous comma.
                    if after_quote and after_quote[0] not in [',', '}', ']']:
                        prev_comma = json_str.rfind(',', 0, last_quote)
                        if prev_comma != -1:
                            json_str = json_str[:prev_comma]
        # Count brackets AFTER trimming so removed fragments don't skew the
        # balance (the original counted before trimming, which could leave
        # the output unbalanced when the trim removed an opener).
        open_braces = json_str.count('{')
        close_braces = json_str.count('}')
        open_brackets = json_str.count('[')
        close_brackets = json_str.count(']')
        # Close any open arrays, then any open objects (negative deltas
        # multiply to the empty string, i.e. no-op, as before).
        json_str += ']' * (open_brackets - close_brackets)
        json_str += '}' * (open_braces - close_braces)
        return json_str
    except Exception:
        # Repair is best-effort by contract; never let it raise.
        return json_str
| # ------------------------- | |
| # Utility: Clean Gemini JSON | |
| # ------------------------- | |
def clean_gemini_json(raw_text: str) -> str:
    """
    Strip markdown fences and leading chatter from Gemini output so the
    remainder can be handed to json.loads().

    Args:
        raw_text: Raw model reply, possibly wrapped in ```json fences or
            prefixed with prose such as "json: {...}".

    Returns:
        The cleaned text (whitespace-trimmed; may still be invalid JSON).
    """
    if not raw_text:
        return raw_text

    cleaned = raw_text.strip()

    # Drop an opening fence line such as ```json (keep everything after it).
    if cleaned.startswith("```"):
        newline_pos = cleaned.find("\n")
        if newline_pos != -1:
            cleaned = cleaned[newline_pos + 1:]

    # Drop a trailing closing fence, if present.
    if cleaned.endswith("```"):
        cleaned = cleaned[:-3]

    # Discard any leading prose before the first '{'.
    brace_pos = cleaned.find("{")
    if brace_pos > 0:
        cleaned = cleaned[brace_pos:]

    return cleaned.strip()
| # ------------------------- | |
| # File type detection | |
| # ------------------------- | |
def detect_file_type(filename: str) -> str:
    """Classify a filename by extension: "pdf", "docx", "txt", or "unknown"."""
    lowered = (filename or "").lower()
    for kind in ("pdf", "docx", "txt"):
        if lowered.endswith("." + kind):
            return kind
    return "unknown"
| # ------------------------- | |
| # Extract text (PDF/DOCX/TXT) | |
| # ------------------------- | |
def extract_text_and_layout(file_bytes: bytes, kind: str):
    """
    Extract plain text from an uploaded document.

    Args:
        file_bytes: Raw file content.
        kind: One of "pdf", "docx", "txt"; anything else yields empty text.

    Returns:
        Tuple (text, layout_info).  layout_info is always None (reserved
        for future use).

    Raises:
        RuntimeError: If extraction fails for the given kind.
    """
    try:
        if kind == "pdf":
            doc = fitz.open(stream=file_bytes, filetype="pdf")
            try:
                texts = [page.get_text() for page in doc]
            finally:
                # Previously the handle was never closed (resource leak).
                doc.close()
            return "\n\n".join(texts), None
        elif kind == "docx":
            from docx import Document as DocReader
            doc = DocReader(io.BytesIO(file_bytes))
            # Skip empty/whitespace-only paragraphs.
            paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
            return "\n\n".join(paragraphs), None
        elif kind == "txt":
            # Tolerate bad bytes rather than failing on odd encodings.
            return file_bytes.decode("utf-8", errors="ignore"), None
        else:
            return "", None
    except Exception as e:
        # Chain the original cause so tracebacks stay debuggable.
        raise RuntimeError(f"Text extraction failed: {e}") from e
| # ------------------------- | |
| # Gemini formatting -> JSON layout (UNIVERSAL) | |
| # ------------------------- | |
def enhance_text_with_gemini(text: str, doc_type: str = "auto", user_prompt: str = "") -> str:
    """
    Send prompt to Gemini to produce a JSON document layout for ANY document
    type.  For very long documents (>10k chars) the prompt switches to a more
    concise output strategy.

    All configured keys in GEMINI_API_KEYS are tried in order; the first
    successful response wins.  (Previously only GEMINI_API_KEY was used and
    the fallback list built at module level was dead code.)

    Args:
        text: Extracted document text.
        doc_type: "auto", "resume", "letter", "report", "article", "essay", etc.
        user_prompt: Custom instructions from the user (may be empty).

    Returns:
        The raw model response string (may include fences -- callers should
        run it through clean_gemini_json()), or a JSON error object string.
    """
    if not GEMINI_API_KEYS:
        return json.dumps({"error": "GEMINI_API_KEY not set"})
    # For very long documents, use a more concise output strategy.
    is_long_doc = len(text) > 10000
    # Build user instructions section (only if the user supplied any).
    user_instructions = ""
    if user_prompt.strip():
        user_instructions = f"""
USER INSTRUCTIONS:
{user_prompt.strip()}
IMPORTANT: Follow the user's instructions while maintaining the JSON format and document structure.
"""
    # Universal prompt that works for ANY document.
    prompt = f"""You are a professional document formatter and editor. Analyze the INPUT TEXT and enhance it.
INPUT TEXT:
{text}
DOCUMENT TYPE: {doc_type}
{user_instructions}
TASK:
1) Analyze the document type (resume, cover letter, report, article, essay, notes, etc.)
2) {"Apply the user's specific instructions" if user_prompt.strip() else "Improve grammar, clarity, and professional tone while preserving all original information"}
3) Organize content with appropriate structure (headings, paragraphs, lists)
4) Output ONLY valid JSON following the schema below - NO markdown, NO commentary
JSON SCHEMA:
{{
"document": [
{{ "type": "heading", "level": 1, "text": "Main Title" }},
{{ "type": "heading", "level": 2, "text": "Section Title" }},
{{ "type": "paragraph", "text": "Regular paragraph text", "align": "left" }},
{{ "type": "bullet_list", "items": ["Item 1", "Item 2"] }},
{{ "type": "number_list", "items": ["Step 1", "Step 2"] }}
]
}}
FORMATTING RULES:
- Use level 1 heading for document title only
- Use level 2 headings for major sections
- {"Keep paragraphs BRIEF - combine similar content" if is_long_doc else "Keep paragraphs concise and well-structured"}
- Use bullet_list for unordered items, number_list for sequences
- {"IMPORTANT: For long documents, be concise - summarize repetitive sections" if is_long_doc else "Preserve ALL original content - do not omit information"}
- Keep the JSON compact - avoid unnecessary fields
DOCUMENT-SPECIFIC GUIDELINES:
- **Resume/CV**: Name (h1), Contact (center paragraph), Summary, Skills, Experience, Education, Certifications
- **Cover Letter**: Your Info, Date, Recipient Info, Salutation, Body, Closing
- **Report/Article**: Title (h1), Abstract, Introduction, Body Sections (h2), Conclusion
- **Essay**: Title (h1, center), Author, Body paragraphs
- **Notes/General**: Logical headings and structure
IMPORTANT:
- Return COMPLETE, VALID JSON only
- Ensure all strings are properly closed with quotes
- Ensure all brackets and braces are balanced
- No trailing commas
- No markdown fences
- {"CRITICAL: Keep output under 30KB - be concise, combine similar sections" if is_long_doc else ""}
"""
    payload = {
        "contents": [{"parts": [{"text": prompt}]}],
        "generationConfig": {
            "temperature": 0.3,
            "topP": 0.8,
            "topK": 40,
        }
    }
    # Try each configured key in turn; first success wins.
    resp_json = None
    last_error = None
    for api_key in GEMINI_API_KEYS:
        try:
            res = requests.post(GEMINI_URL + f"?key={api_key}", json=payload, timeout=240)
            res.raise_for_status()
            resp_json = res.json()
            break
        except Exception as e:
            last_error = e
            continue
    if resp_json is None:
        return json.dumps({"error": f"gemini request failed: {str(last_error)}"})
    try:
        content = resp_json["candidates"][0]["content"]["parts"][0]["text"]
        # A complete JSON reply ends with '}' (or ']'); anything else looks
        # truncated, so attempt a best-effort repair before returning.
        if not content.rstrip().endswith("}") and not content.rstrip().endswith("]"):
            content = repair_truncated_json(content)
        return content
    except Exception:
        # Unexpected response shape: surface the whole payload as the error.
        return json.dumps({"error": resp_json})
| # ------------------------- | |
| # Advanced DOCX builder | |
| # ------------------------- | |
def build_docx_from_design(layout_json_text: str) -> bytes:
    """
    Convert JSON layout (string) into a polished DOCX binary.

    The input is typically raw Gemini output: it is fence-stripped via
    clean_gemini_json() and, if it looks cut off, patched with
    repair_truncated_json() before parsing.  Recognized block types:
    heading, paragraph, bullet_list / list(kind="bulleted"),
    number_list / list(kind="numbered"), table, page_break, section_break.
    Unknown blocks are rendered as a plain paragraph of their repr.

    Raises ValueError on invalid JSON.
    """
    cleaned = clean_gemini_json(layout_json_text)
    # Try to repair if truncated (a complete layout object ends with '}').
    if not cleaned.rstrip().endswith('}'):
        cleaned = repair_truncated_json(cleaned)
    try:
        data = json.loads(cleaned)
    except json.JSONDecodeError as e:
        # If parsing still fails, try to salvage what we can.
        try:
            # Find the last complete object and chop everything after it.
            last_valid = cleaned.rfind('}')
            if last_valid != -1:
                # Try to close the top-level "document" array and root object.
                test_json = cleaned[:last_valid + 1] + ']}'
                data = json.loads(test_json)
            else:
                raise ValueError(f"Could not parse JSON: {e}\nContent preview: {cleaned[:500]}...")
        except:
            raise ValueError(f"Invalid layout JSON: {e}\nContent preview: {cleaned[:500]}...")
    except Exception as e:
        raise ValueError(f"Invalid layout JSON: {e}\nRaw: {cleaned[:1000]}")
    doc = Document()
    # Set page margins (slightly narrow for a denser layout).
    sec = doc.sections[0]
    sec.top_margin = Inches(0.6)
    sec.bottom_margin = Inches(0.6)
    sec.left_margin = Inches(0.7)
    sec.right_margin = Inches(0.7)
    # Default font for the whole document (best-effort; ignore if the
    # "Normal" style is missing or read-only in this template).
    try:
        style = doc.styles["Normal"]
        style.font.name = "Calibri"
        style.font.size = Pt(11)
    except Exception:
        pass

    def add_heading_text(text: str, level: int = 1):
        # Headings: 18pt title, 14pt sections, 12pt below that; always bold
        # and left-aligned.
        h = doc.add_heading(level=level)
        run = h.add_run(text or "")
        run.bold = True
        run.font.size = Pt(18 if level == 1 else 14 if level == 2 else 12)
        h.alignment = WD_ALIGN_PARAGRAPH.LEFT

    def add_paragraph_text(text: str, bold=False, italic=False, align="left"):
        # Body paragraph with optional emphasis; align is one of
        # "left" / "center" / "right" / "justify" (anything else -> left).
        p = doc.add_paragraph()
        r = p.add_run(text or "")
        r.bold = bool(bold)
        r.italic = bool(italic)
        if align == "center":
            p.alignment = WD_ALIGN_PARAGRAPH.CENTER
        elif align == "right":
            p.alignment = WD_ALIGN_PARAGRAPH.RIGHT
        elif align == "justify":
            p.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
        else:
            p.alignment = WD_ALIGN_PARAGRAPH.LEFT
        return p

    for block in data.get("document", []):
        btype = block.get("type", "").lower()
        if btype == "heading":
            lvl = int(block.get("level", 1))
            # Clamp level into python-docx's valid 1..9 range.
            add_heading_text(block.get("text", ""), level=min(max(lvl, 1), 9))
        elif btype == "paragraph":
            add_paragraph_text(
                block.get("text", ""),
                bold=block.get("bold", False),
                italic=block.get("italic", False),
                align=block.get("align", "left"),
            )
        elif btype == "bullet_list" or (btype == "list" and block.get("kind") == "bulleted"):
            items = block.get("items", []) or []
            for item in items:
                # Items may be plain strings or {"text": ...} dicts.
                text = item.get("text") if isinstance(item, dict) else str(item)
                p = doc.add_paragraph(text, style="List Bullet")
                p.paragraph_format.left_indent = Inches(0.25)
                p.paragraph_format.space_after = Pt(6)
        elif btype == "number_list" or (btype == "list" and block.get("kind") == "numbered"):
            items = block.get("items", []) or []
            for item in items:
                text = item.get("text") if isinstance(item, dict) else str(item)
                p = doc.add_paragraph(text, style="List Number")
                p.paragraph_format.left_indent = Inches(0.25)
                p.paragraph_format.space_after = Pt(6)
        elif btype == "table":
            rows = block.get("rows", []) or []
            if not rows:
                continue
            # Column count comes from the first row; assumes rows are
            # rectangular -- TODO confirm ragged rows can't occur upstream.
            cols = len(rows[0])
            table = doc.add_table(rows=len(rows), cols=cols)
            table.style = "Table Grid"
            for r_idx, row in enumerate(rows):
                for c_idx, cell in enumerate(row):
                    if isinstance(cell, dict):
                        text = str(cell.get("text", ""))
                        is_header = cell.get("is_header", False)
                    else:
                        text = str(cell)
                        is_header = r_idx == 0  # Auto-detect first row as header
                    cell_obj = table.rows[r_idx].cells[c_idx]
                    cell_obj.text = text
                    if is_header:
                        # Bold every run in a header cell.
                        for p in cell_obj.paragraphs:
                            for run in p.runs:
                                run.bold = True
        elif btype == "page_break":
            doc.add_page_break()
        elif btype == "section_break":
            new_sec = doc.add_section()
            # Optional per-section margins, given in points.
            margins = block.get("margins_pt", {})
            try:
                if margins:
                    top = margins.get("top")
                    bottom = margins.get("bottom")
                    left = margins.get("left")
                    right = margins.get("right")
                    if top: new_sec.top_margin = Pt(float(top))
                    if bottom: new_sec.bottom_margin = Pt(float(bottom))
                    if left: new_sec.left_margin = Pt(float(left))
                    if right: new_sec.right_margin = Pt(float(right))
            except Exception:
                # Bad margin values are ignored; section defaults remain.
                pass
        else:
            # Unknown block: add as paragraph
            add_paragraph_text(str(block))
    # Finalize to bytes
    out = io.BytesIO()
    doc.save(out)
    out.seek(0)
    return out.getvalue()
| # ------------------------- | |
| # FastAPI routes | |
| # ------------------------- | |
async def route_extract(file: UploadFile = File(...)):
    """Extract plain text from an uploaded PDF/DOCX/TXT document."""
    # NOTE(review): no @app route decorator is visible in this chunk, though
    # root() advertises POST /extract -- confirm the route is registered.
    try:
        raw_bytes = await file.read()
        file_kind = detect_file_type(file.filename)
        extracted, _ = extract_text_and_layout(raw_bytes, file_kind)
        return {"text": extracted}
    except Exception as exc:
        traceback.print_exc()
        return JSONResponse({"error": str(exc)}, status_code=500)
async def route_enhance(
    file: UploadFile = File(...),
    doc_type: str = "auto",
    prompt: str = ""
):
    """
    Universal document enhancer - works for ANY document type.
    Extracts text, enhances with AI, returns formatted DOCX.
    Query/Form params:
    - doc_type: "auto", "resume", "letter", "report", "article", "essay"
    - prompt: User instructions for enhancement (optional)
    """
    # NOTE(review): no @app.post("/enhance") decorator is visible in this
    # chunk -- confirm the route is actually registered with the app.
    try:
        fb = await file.read()
        kind = detect_file_type(file.filename)
        text, _ = extract_text_and_layout(fb, kind)
        if not text.strip():
            return JSONResponse({"error": "No text extracted from document"}, status_code=400)
        # Get enhanced JSON layout from Gemini, honoring the user prompt.
        raw = enhance_text_with_gemini(text, doc_type, prompt)
        cleaned = clean_gemini_json(raw)
        # If Gemini itself reported an error ({"error": ...}), surface it
        # rather than trying to build a document from it.
        try:
            test_parse = json.loads(cleaned)
            if "error" in test_parse:
                return JSONResponse({"error": test_parse["error"]}, status_code=500)
        except (json.JSONDecodeError, TypeError):
            # Was a bare `except: pass` -- narrowed so real bugs surface.
            # Unparseable text falls through: build_docx_from_design will
            # clean/repair it and raise a proper ValueError if truly invalid.
            pass
        # Build DOCX from the JSON layout.
        docx_bytes = build_docx_from_design(cleaned)
        # Sanitize filename for the Content-Disposition header (latin-1 safe).
        safe_filename = sanitize_filename(f"Enhanced_{file.filename or 'document.docx'}")
        return StreamingResponse(
            io.BytesIO(docx_bytes),
            media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            headers={
                "Content-Disposition": f'attachment; filename="{safe_filename}"'
            },
        )
    except ValueError as ve:
        traceback.print_exc()
        return JSONResponse({"error": f"JSON parsing error: {str(ve)}"}, status_code=400)
    except Exception as e:
        traceback.print_exc()
        return JSONResponse({"error": str(e)}, status_code=500)
async def route_preview(
    file: UploadFile = File(...),
    doc_type: str = "auto",
    prompt: str = ""
):
    """
    Return the AI-generated JSON layout for inspection, without producing
    a DOCX download.  Useful for debugging the document structure.
    """
    try:
        payload_bytes = await file.read()
        file_kind = detect_file_type(file.filename)
        extracted, _ = extract_text_and_layout(payload_bytes, file_kind)
        if not extracted.strip():
            return JSONResponse({"error": "No text extracted"}, status_code=400)
        # Enhance, strip fences, and hand back the parsed structure.
        layout_text = clean_gemini_json(
            enhance_text_with_gemini(extracted, doc_type, prompt)
        )
        return {"layout_json": json.loads(layout_text)}
    except Exception as exc:
        traceback.print_exc()
        return JSONResponse({"error": str(exc)}, status_code=500)
async def route_design(
    file: UploadFile = File(...),
    doc_type: str = "auto",
    prompt: str = ""
):
    """
    Legacy endpoint - same as /enhance with auto detection.
    """
    try:
        body = await file.read()
        file_kind = detect_file_type(file.filename)
        extracted, _ = extract_text_and_layout(body, file_kind)
        if not extracted.strip():
            return JSONResponse({"error": "No text extracted"}, status_code=400)
        # Enhance, clean, and render straight to DOCX.
        layout_text = clean_gemini_json(
            enhance_text_with_gemini(extracted, doc_type, prompt)
        )
        docx_bytes = build_docx_from_design(layout_text)
        safe_filename = sanitize_filename(f"Professional_{file.filename or 'document.docx'}")
        disposition = f'attachment; filename="{safe_filename}"'
        return StreamingResponse(
            io.BytesIO(docx_bytes),
            media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            headers={"Content-Disposition": disposition},
        )
    except ValueError as ve:
        return JSONResponse({"error": str(ve)}, status_code=400)
    except Exception as exc:
        traceback.print_exc()
        return JSONResponse({"error": str(exc)}, status_code=500)
async def route_full(
    file: UploadFile = File(...),
    doc_type: str = "auto",
    prompt: str = ""
):
    """
    Full pipeline: extract -> Gemini JSON -> DOCX.
    Universal document enhancer (legacy alias of /enhance).
    """
    try:
        body = await file.read()
        file_kind = detect_file_type(file.filename)
        extracted, _ = extract_text_and_layout(body, file_kind)
        if not extracted.strip():
            return JSONResponse({"error": "No text extracted"}, status_code=400)
        # Enhance, clean, and render straight to DOCX.
        layout_text = clean_gemini_json(
            enhance_text_with_gemini(extracted, doc_type, prompt)
        )
        docx_bytes = build_docx_from_design(layout_text)
        safe_filename = sanitize_filename(f"Enhanced_{file.filename or 'document.docx'}")
        disposition = f'attachment; filename="{safe_filename}"'
        return StreamingResponse(
            io.BytesIO(docx_bytes),
            media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            headers={"Content-Disposition": disposition},
        )
    except ValueError as ve:
        return JSONResponse({"error": str(ve)}, status_code=400)
    except Exception as exc:
        traceback.print_exc()
        return JSONResponse({"error": str(exc)}, status_code=500)
| # ------------------------- | |
| # Root | |
| # ------------------------- | |
def root():
    """Service metadata plus a human-readable directory of endpoints."""
    endpoints = {
        "/extract": "Extract plain text from document",
        "/enhance": "Full pipeline: extract + AI enhancement + DOCX (RECOMMENDED)",
        "/preview": "Preview JSON layout without downloading",
        "/design": "Same as /enhance (legacy)",
        "/full": "Same as /enhance (legacy)",
    }
    usage = {
        "basic": "POST /enhance with file upload",
        "with_prompt": "POST /enhance?prompt=your_instructions&doc_type=auto",
    }
    return {
        "service": "Universal Document Enhancer v5",
        "status": "ok",
        "description": "AI-powered document formatter for ANY document type",
        "supported_types": [
            "Resume/CV",
            "Cover Letter",
            "Report",
            "Article",
            "Essay",
            "Notes",
            "Any text document",
        ],
        "endpoints": endpoints,
        "usage": usage,
    }
def health():
    """Lightweight liveness probe; reports whether a Gemini key is set."""
    # NOTE(review): reflects only the primary GEMINI_API_KEY env var, not the
    # GEMINI_API_KEYS fallback list -- confirm which the probe should report.
    probe = {
        "status": "healthy",
        "gemini_configured": bool(GEMINI_API_KEY),
        "version": "5.0 - Universal with Prompt Support",
    }
    return probe