omgy committed on
Commit
0c9040f
·
verified ·
1 Parent(s): fdddc6f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +255 -127
app.py CHANGED
@@ -1,4 +1,4 @@
1
- # app.py -- Backend v5 (Gemini -> JSON -> Advanced DOCX)
2
  # Universal Document Enhancer - Works for ANY document type
3
  import os
4
  import io
@@ -10,16 +10,16 @@ import base64
10
  from fastapi import FastAPI, File, UploadFile, Form
11
  from fastapi.middleware.cors import CORSMiddleware
12
  from fastapi.responses import JSONResponse, StreamingResponse
13
- from typing import Optional
14
  import fitz # pymupdf
15
  from docx import Document
16
  from docx.shared import Pt, Inches, RGBColor
17
  from docx.enum.text import WD_ALIGN_PARAGRAPH
18
- from docx.oxml.ns import qn
19
  from PIL import Image
20
  import requests
21
 
22
- app = FastAPI(title="Document Enhancer v5")
23
 
24
  app.add_middleware(
25
  CORSMiddleware,
@@ -62,30 +62,20 @@ def sanitize_filename(filename: str) -> str:
62
  if not filename:
63
  return "document.docx"
64
 
65
- # Remove file extension first
66
  name_without_ext = filename.rsplit('.', 1)[0] if '.' in filename else filename
67
 
68
- # Convert to ASCII, removing/replacing non-ASCII characters
69
- # This will convert – to -, é to e, etc.
70
  try:
71
  ascii_name = name_without_ext.encode('ascii', 'ignore').decode('ascii')
72
  except:
73
  ascii_name = "document"
74
 
75
- # If nothing left after ASCII conversion, use default
76
  if not ascii_name or not ascii_name.strip():
77
  ascii_name = "document"
78
 
79
- # Remove any remaining problematic characters (keep only alphanumeric, spaces, hyphens, underscores)
80
  safe_name = re.sub(r'[^\w\s\-]', '', ascii_name)
81
-
82
- # Replace multiple spaces/hyphens with single underscore
83
  safe_name = re.sub(r'[\s\-]+', '_', safe_name)
84
-
85
- # Trim and ensure not empty
86
  safe_name = safe_name.strip('_') or "document"
87
 
88
- # Add .docx extension
89
  return f"{safe_name}.docx"
90
 
91
 
@@ -97,35 +87,27 @@ def repair_truncated_json(json_str: str) -> str:
97
  Attempt to repair truncated JSON by closing open structures.
98
  """
99
  try:
100
- # Count open/close brackets
101
  open_braces = json_str.count('{')
102
  close_braces = json_str.count('}')
103
  open_brackets = json_str.count('[')
104
  close_brackets = json_str.count(']')
105
 
106
- # If JSON is truncated mid-string, remove the incomplete part
107
  if json_str.rstrip().endswith('"'):
108
- # Find the last complete object
109
  last_complete = json_str.rfind('"}')
110
  if last_complete != -1:
111
  json_str = json_str[:last_complete + 2]
112
  else:
113
- # Remove trailing incomplete text
114
  last_quote = json_str.rfind('"')
115
  if last_quote != -1:
116
- # Check if this quote is part of an incomplete string
117
  after_quote = json_str[last_quote + 1:].strip()
118
  if after_quote and after_quote[0] not in [',', '}', ']']:
119
- # Incomplete string, remove it
120
  prev_comma = json_str.rfind(',', 0, last_quote)
121
  if prev_comma != -1:
122
  json_str = json_str[:prev_comma]
123
 
124
- # Close any open arrays
125
  for _ in range(open_brackets - close_brackets):
126
  json_str += ']'
127
 
128
- # Close any open objects
129
  for _ in range(open_braces - close_braces):
130
  json_str += '}'
131
 
@@ -139,30 +121,23 @@ def repair_truncated_json(json_str: str) -> str:
139
  # -------------------------
140
def clean_gemini_json(raw_text: str) -> str:
    """
    Strip markdown fences and surrounding chatter from a Gemini reply.

    The model is asked for bare JSON but may still wrap it in a ```json
    fence, put a sentence in front of it, or add a remark after the closing
    fence. The result starts at the first "{" so it can be handed to
    json.loads().

    Args:
        raw_text: Raw model output; may be empty or None.

    Returns:
        The cleaned text. Falsy input is returned unchanged.
    """
    if not raw_text:
        return raw_text

    fence = "```"
    cleaned = raw_text.strip()

    # Opening fence: drop the whole first line, which also removes a
    # language marker such as "json".
    if cleaned.startswith(fence):
        first_line_end = cleaned.find("\n")
        if first_line_end != -1:
            cleaned = cleaned[first_line_end + 1:]

    # Closing fence: cut at the first fence that follows the last "}".
    # Searching only after the last brace leaves backticks inside the
    # document text alone, and it also covers replies that did not *start*
    # with a fence or that add commentary after the closing fence.
    last_brace = cleaned.rfind("}")
    if last_brace != -1:
        closing_fence = cleaned.find(fence, last_brace + 1)
        if closing_fence != -1:
            cleaned = cleaned[:closing_fence]
    elif cleaned.endswith(fence):
        cleaned = cleaned[:-len(fence)]

    # Drop any preamble ("json:", "Here is the result", ...) before the object.
    first_brace = cleaned.find("{")
    if first_brace > 0:
        cleaned = cleaned[first_brace:]

    return cleaned.strip()
167
 
168
 
@@ -181,57 +156,122 @@ def detect_file_type(filename: str) -> str:
181
 
182
 
183
  # -------------------------
184
- # Extract text (PDF/DOCX/TXT)
185
  # -------------------------
186
def extract_text_and_layout(file_bytes: bytes, kind: str):
    """
    Extract plain text from an uploaded document.

    Args:
        file_bytes: Raw file content.
        kind: "pdf", "docx" or "txt"; any other value yields empty text.

    Returns:
        (text, layout_info). layout_info is always None; it is reserved for
        future use.

    Raises:
        RuntimeError: If reading the document fails. The original exception
            is chained as the cause.
    """
    try:
        if kind == "pdf":
            doc = fitz.open(stream=file_bytes, filetype="pdf")
            try:
                texts = [page.get_text() for page in doc]
            finally:
                # Release the PyMuPDF document even if a page fails to parse.
                doc.close()
            return "\n\n".join(texts), None

        if kind == "docx":
            from docx import Document as DocReader

            doc = DocReader(io.BytesIO(file_bytes))
            paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
            return "\n\n".join(paragraphs), None

        if kind == "txt":
            # utf-8-sig decodes exactly like utf-8 but drops a leading BOM,
            # which would otherwise end up as U+FEFF at the start of the text.
            return file_bytes.decode("utf-8-sig", errors="ignore"), None

        return "", None

    except Exception as e:
        raise RuntimeError(f"Text extraction failed: {e}") from e
213
 
214
 
215
  # -------------------------
216
- # Gemini formatting -> JSON layout (UNIVERSAL)
217
  # -------------------------
218
- def enhance_text_with_gemini(text: str, doc_type: str = "auto", user_prompt: str = "") -> str:
 
219
  """
220
- Send prompt to Gemini to produce JSON document layout for ANY document type.
221
- Tries multiple API keys if one fails (rate limit, quota exceeded, etc.).
222
- For very long documents, uses a summarization strategy.
223
- Returns the raw string response (may include fences), caller should clean it.
224
-
225
- doc_type: "auto", "resume", "letter", "report", "article", "essay", etc.
226
- user_prompt: Custom instructions from user
227
  """
228
  if not GEMINI_API_KEYS:
229
  return json.dumps({"error": "No GEMINI_API_KEY configured"})
230
 
231
- # For very long documents, use a more concise output strategy
232
  is_long_doc = len(text) > 10000
233
 
234
- # Build user instructions section
235
  user_instructions = ""
236
  if user_prompt.strip():
237
  user_instructions = f"""
@@ -241,49 +281,74 @@ USER INSTRUCTIONS:
241
  IMPORTANT: Follow the user's instructions while maintaining the JSON format and document structure.
242
  """
243
 
244
- # Universal prompt that works for ANY document
 
 
 
 
 
 
 
245
  prompt = f"""You are a professional document formatter and editor. Analyze the INPUT TEXT and enhance it.
246
 
247
  INPUT TEXT:
248
  {text}
 
249
 
250
  DOCUMENT TYPE: {doc_type}
251
  {user_instructions}
 
252
  TASK:
253
  1) Analyze the document type (resume, cover letter, report, article, essay, notes, etc.)
254
  2) {"Apply the user's specific instructions" if user_prompt.strip() else "Improve grammar, clarity, and professional tone while preserving all original information"}
255
- 3) Organize content with appropriate structure (headings, paragraphs, lists)
256
- 4) Output ONLY valid JSON following the schema below - NO markdown, NO commentary
 
257
 
258
  JSON SCHEMA:
259
  {{
260
  "document": [
261
- {{ "type": "heading", "level": 1, "text": "Main Title" }},
262
  {{ "type": "heading", "level": 2, "text": "Section Title" }},
263
- {{ "type": "paragraph", "text": "Regular paragraph text", "align": "left" }},
264
- {{ "type": "bullet_list", "items": ["Item 1", "Item 2"] }},
265
- {{ "type": "number_list", "items": ["Step 1", "Step 2"] }}
 
 
266
  ]
267
  }}
268
 
269
  FORMATTING RULES:
270
  - Use level 1 heading for document title only
271
  - Use level 2 headings for major sections
 
272
  - {"Keep paragraphs BRIEF - combine similar content" if is_long_doc else "Keep paragraphs concise and well-structured"}
273
  - Use bullet_list for unordered items, number_list for sequences
 
 
274
  - {"IMPORTANT: For long documents, be concise - summarize repetitive sections" if is_long_doc else "Preserve ALL original content - do not omit information"}
275
- - Keep the JSON compact - avoid unnecessary fields
 
 
 
 
 
 
 
 
 
276
 
277
  DOCUMENT-SPECIFIC GUIDELINES:
278
  - **Resume/CV**: Name (h1), Contact (center paragraph), Summary, Skills, Experience, Education, Certifications
279
  - **Cover Letter**: Your Info, Date, Recipient Info, Salutation, Body, Closing
280
- - **Report/Article**: Title (h1), Abstract, Introduction, Body Sections (h2), Conclusion
281
  - **Essay**: Title (h1, center), Author, Body paragraphs
282
- - **Notes/General**: Logical headings and structure
283
 
284
  IMPORTANT:
285
  - Return COMPLETE, VALID JSON only
286
  - Ensure all strings are properly closed with quotes
 
287
  - Ensure all brackets and braces are balanced
288
  - No trailing commas
289
  - No markdown fences
@@ -299,7 +364,7 @@ IMPORTANT:
299
  }
300
  }
301
 
302
- # Try each API key in sequence until one works
303
  last_error = None
304
  for idx, api_key in enumerate(GEMINI_API_KEYS):
305
  try:
@@ -311,7 +376,6 @@ IMPORTANT:
311
  timeout=240
312
  )
313
 
314
- # Check for rate limit or quota errors
315
  if res.status_code == 429:
316
  print(f"API key #{idx + 1} rate limited, trying next...")
317
  last_error = "Rate limit exceeded"
@@ -325,10 +389,8 @@ IMPORTANT:
325
  res.raise_for_status()
326
  resp_json = res.json()
327
 
328
- # Extract content
329
  content = resp_json["candidates"][0]["content"]["parts"][0]["text"]
330
 
331
- # Check if response looks truncated
332
  if not content.rstrip().endswith("}") and not content.rstrip().endswith("]"):
333
  content = repair_truncated_json(content)
334
 
@@ -350,35 +412,29 @@ IMPORTANT:
350
  last_error = str(e)
351
  continue
352
 
353
- # All API keys failed
354
  return json.dumps({
355
  "error": f"All {len(GEMINI_API_KEYS)} API keys failed. Last error: {last_error}"
356
  })
357
 
358
 
359
  # -------------------------
360
- # Advanced DOCX builder
361
  # -------------------------
362
- def build_docx_from_design(layout_json_text: str) -> bytes:
363
  """
364
- Convert JSON layout (string) into a polished DOCX binary.
365
- Raises ValueError on invalid JSON.
366
  """
367
  cleaned = clean_gemini_json(layout_json_text)
368
 
369
- # Try to repair if truncated
370
  if not cleaned.rstrip().endswith('}'):
371
  cleaned = repair_truncated_json(cleaned)
372
 
373
  try:
374
  data = json.loads(cleaned)
375
  except json.JSONDecodeError as e:
376
- # If still fails, try to salvage what we can
377
  try:
378
- # Find the last valid complete object
379
  last_valid = cleaned.rfind('}')
380
  if last_valid != -1:
381
- # Try to close the document array
382
  test_json = cleaned[:last_valid + 1] + ']}'
383
  data = json.loads(test_json)
384
  else:
@@ -397,7 +453,7 @@ def build_docx_from_design(layout_json_text: str) -> bytes:
397
  sec.left_margin = Inches(0.7)
398
  sec.right_margin = Inches(0.7)
399
 
400
- # Default font
401
  try:
402
  style = doc.styles["Normal"]
403
  style.font.name = "Calibri"
@@ -427,6 +483,9 @@ def build_docx_from_design(layout_json_text: str) -> bytes:
427
  p.alignment = WD_ALIGN_PARAGRAPH.LEFT
428
  return p
429
 
 
 
 
430
  for block in data.get("document", []):
431
  btype = block.get("type", "").lower()
432
 
@@ -462,24 +521,73 @@ def build_docx_from_design(layout_json_text: str) -> bytes:
462
  rows = block.get("rows", []) or []
463
  if not rows:
464
  continue
465
- cols = len(rows[0])
 
 
 
 
466
  table = doc.add_table(rows=len(rows), cols=cols)
467
  table.style = "Table Grid"
 
 
 
468
  for r_idx, row in enumerate(rows):
469
  for c_idx, cell in enumerate(row):
470
  if isinstance(cell, dict):
471
  text = str(cell.get("text", ""))
472
- is_header = cell.get("is_header", False)
473
  else:
474
  text = str(cell)
475
- is_header = r_idx == 0 # Auto-detect first row as header
476
  cell_obj = table.rows[r_idx].cells[c_idx]
477
  cell_obj.text = text
478
- if is_header:
 
 
479
  for p in cell_obj.paragraphs:
480
  for run in p.runs:
481
  run.bold = True
482
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
483
  elif btype == "page_break":
484
  doc.add_page_break()
485
 
@@ -515,12 +623,16 @@ def build_docx_from_design(layout_json_text: str) -> bytes:
515
  # -------------------------
516
@app.post("/extract")
async def route_extract(file: UploadFile = File(...)):
    """
    Return the plain text of an uploaded document as {"text": ...}.

    Any failure is printed as a traceback and reported to the client as a
    JSON error with status 500.
    """
    try:
        payload = await file.read()
        text, _layout = extract_text_and_layout(
            payload, detect_file_type(file.filename)
        )
    except Exception as exc:
        traceback.print_exc()
        return JSONResponse({"error": str(exc)}, status_code=500)
    else:
        return {"text": text}
@@ -533,26 +645,20 @@ async def route_enhance(
533
  prompt: str = ""
534
  ):
535
  """
536
- Universal document enhancer - works for ANY document type.
537
- Extracts text, enhances with AI, returns formatted DOCX.
538
-
539
- Query/Form params:
540
- - doc_type: "auto", "resume", "letter", "report", "article", "essay"
541
- - prompt: User instructions for enhancement (optional)
542
  """
543
  try:
544
  fb = await file.read()
545
  kind = detect_file_type(file.filename)
546
- text, _ = extract_text_and_layout(fb, kind)
547
 
548
  if not text.strip():
549
  return JSONResponse({"error": "No text extracted from document"}, status_code=400)
550
 
551
- # Get enhanced JSON from Gemini with user prompt
552
- raw = enhance_text_with_gemini(text, doc_type, prompt)
553
  cleaned = clean_gemini_json(raw)
554
 
555
- # Check if Gemini returned an error
556
  try:
557
  test_parse = json.loads(cleaned)
558
  if "error" in test_parse:
@@ -560,10 +666,9 @@ async def route_enhance(
560
  except:
561
  pass
562
 
563
- # Build DOCX from JSON layout
564
- docx_bytes = build_docx_from_design(cleaned)
565
 
566
- # Sanitize filename for HTTP header
567
  safe_filename = sanitize_filename(f"Enhanced_{file.filename or 'document.docx'}")
568
 
569
  return StreamingResponse(
@@ -588,22 +693,26 @@ async def route_preview(
588
  prompt: str = ""
589
  ):
590
  """
591
- Preview the JSON layout without downloading DOCX.
592
- Useful for debugging and seeing the structure.
593
  """
594
  try:
595
  fb = await file.read()
596
  kind = detect_file_type(file.filename)
597
- text, _ = extract_text_and_layout(fb, kind)
598
 
599
  if not text.strip():
600
  return JSONResponse({"error": "No text extracted"}, status_code=400)
601
 
602
- raw = enhance_text_with_gemini(text, doc_type, prompt)
603
  cleaned = clean_gemini_json(raw)
604
 
605
- # Return the JSON structure for preview
606
- return {"layout_json": json.loads(cleaned)}
 
 
 
 
 
607
  except Exception as e:
608
  traceback.print_exc()
609
  return JSONResponse({"error": str(e)}, status_code=500)
@@ -615,20 +724,18 @@ async def route_design(
615
  doc_type: str = "auto",
616
  prompt: str = ""
617
  ):
618
- """
619
- Legacy endpoint - same as /enhance with auto detection.
620
- """
621
  try:
622
  fb = await file.read()
623
  kind = detect_file_type(file.filename)
624
- text, _ = extract_text_and_layout(fb, kind)
625
 
626
  if not text.strip():
627
  return JSONResponse({"error": "No text extracted"}, status_code=400)
628
 
629
- raw = enhance_text_with_gemini(text, doc_type, prompt)
630
  cleaned = clean_gemini_json(raw)
631
- docx_bytes = build_docx_from_design(cleaned)
632
 
633
  safe_filename = sanitize_filename(f"Professional_{file.filename or 'document.docx'}")
634
 
@@ -652,21 +759,18 @@ async def route_full(
652
  doc_type: str = "auto",
653
  prompt: str = ""
654
  ):
655
- """
656
- Full pipeline: extract -> Gemini JSON -> DOCX.
657
- Universal document enhancer.
658
- """
659
  try:
660
  fb = await file.read()
661
  kind = detect_file_type(file.filename)
662
- text, _ = extract_text_and_layout(fb, kind)
663
 
664
  if not text.strip():
665
  return JSONResponse({"error": "No text extracted"}, status_code=400)
666
 
667
- raw = enhance_text_with_gemini(text, doc_type, prompt)
668
  cleaned = clean_gemini_json(raw)
669
- docx_bytes = build_docx_from_design(cleaned)
670
 
671
  safe_filename = sanitize_filename(f"Enhanced_{file.filename or 'document.docx'}")
672
 
@@ -687,8 +791,8 @@ async def route_full(
687
  @app.post("/add-signature")
688
  async def route_add_signature(
689
  file: UploadFile = File(...),
690
- signature: str = Form(...), # Base64 encoded image
691
- position: str = Form("bottom-right"), # bottom-right, bottom-center, bottom-left, custom
692
  signer_name: Optional[str] = Form(None)
693
  ):
694
  """
@@ -721,11 +825,7 @@ async def route_add_signature(
721
  except Exception as e:
722
  return JSONResponse({"error": f"Invalid signature image: {str(e)}"}, status_code=400)
723
 
724
- # Get the last section (last page)
725
- last_section = doc.sections[-1]
726
-
727
  # Add signature at the end of document
728
- # Add some spacing before signature
729
  doc.add_paragraph()
730
 
731
  # Create signature paragraph
@@ -777,21 +877,38 @@ async def route_add_signature(
777
@app.get("/")
def root():
    """Describe the service: status, supported document types, endpoints, usage."""
    supported_types = [
        "Resume/CV",
        "Cover Letter",
        "Report",
        "Article",
        "Essay",
        "Notes",
        "Any text document",
    ]

    # Path -> one-line description, in the order shown to clients.
    endpoints = {
        "/extract": "Extract plain text from document",
        "/enhance": "Full pipeline: extract + AI enhancement + DOCX (RECOMMENDED)",
        "/add-signature": "Add signature to existing DOCX file",
        "/preview": "Preview JSON layout without downloading",
        "/design": "Same as /enhance (legacy)",
        "/full": "Same as /enhance (legacy)",
    }

    usage = {
        "basic": "POST /enhance with file upload",
        "with_prompt": "POST /enhance?prompt=your_instructions&doc_type=auto",
    }

    return {
        "service": "Universal Document Enhancer v5",
        "status": "ok",
        "description": "AI-powered document formatter for ANY document type",
        "supported_types": supported_types,
        "endpoints": endpoints,
        "usage": usage,
    }
797
 
@@ -801,6 +918,17 @@ def health():
801
  return {
802
  "status": "healthy",
803
  "api_keys_configured": len(GEMINI_API_KEYS),
804
- "api_keys_available": GEMINI_API_KEYS[:1] if GEMINI_API_KEYS else [], # Only show first key for security
805
- "version": "5.0 - Universal with Prompt Support + Multi-API-Key Fallback"
806
- }
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py -- Backend v6 (Enhanced: Tables, Images, Emojis, Icons)
2
  # Universal Document Enhancer - Works for ANY document type
3
  import os
4
  import io
 
10
  from fastapi import FastAPI, File, UploadFile, Form
11
  from fastapi.middleware.cors import CORSMiddleware
12
  from fastapi.responses import JSONResponse, StreamingResponse
13
+ from typing import Optional, List, Dict, Any
14
  import fitz # pymupdf
15
  from docx import Document
16
  from docx.shared import Pt, Inches, RGBColor
17
  from docx.enum.text import WD_ALIGN_PARAGRAPH
18
+ from docx.oxml.shared import OxmlElement, qn
19
  from PIL import Image
20
  import requests
21
 
22
+ app = FastAPI(title="Document Enhancer v6")
23
 
24
  app.add_middleware(
25
  CORSMiddleware,
 
62
  if not filename:
63
  return "document.docx"
64
 
 
65
  name_without_ext = filename.rsplit('.', 1)[0] if '.' in filename else filename
66
 
 
 
67
  try:
68
  ascii_name = name_without_ext.encode('ascii', 'ignore').decode('ascii')
69
  except:
70
  ascii_name = "document"
71
 
 
72
  if not ascii_name or not ascii_name.strip():
73
  ascii_name = "document"
74
 
 
75
  safe_name = re.sub(r'[^\w\s\-]', '', ascii_name)
 
 
76
  safe_name = re.sub(r'[\s\-]+', '_', safe_name)
 
 
77
  safe_name = safe_name.strip('_') or "document"
78
 
 
79
  return f"{safe_name}.docx"
80
 
81
 
 
87
  Attempt to repair truncated JSON by closing open structures.
88
  """
89
  try:
 
90
  open_braces = json_str.count('{')
91
  close_braces = json_str.count('}')
92
  open_brackets = json_str.count('[')
93
  close_brackets = json_str.count(']')
94
 
 
95
  if json_str.rstrip().endswith('"'):
 
96
  last_complete = json_str.rfind('"}')
97
  if last_complete != -1:
98
  json_str = json_str[:last_complete + 2]
99
  else:
 
100
  last_quote = json_str.rfind('"')
101
  if last_quote != -1:
 
102
  after_quote = json_str[last_quote + 1:].strip()
103
  if after_quote and after_quote[0] not in [',', '}', ']']:
 
104
  prev_comma = json_str.rfind(',', 0, last_quote)
105
  if prev_comma != -1:
106
  json_str = json_str[:prev_comma]
107
 
 
108
  for _ in range(open_brackets - close_brackets):
109
  json_str += ']'
110
 
 
111
  for _ in range(open_braces - close_braces):
112
  json_str += '}'
113
 
 
121
  # -------------------------
122
def clean_gemini_json(raw_text: str) -> str:
    """
    Strip markdown fences and surrounding chatter from a Gemini reply.

    The model is asked for bare JSON but may still wrap it in a ```json
    fence, put a sentence in front of it, or add a remark after the closing
    fence. The result starts at the first "{" so it can be handed to
    json.loads().

    Args:
        raw_text: Raw model output; may be empty or None.

    Returns:
        The cleaned text. Falsy input is returned unchanged.
    """
    if not raw_text:
        return raw_text

    fence = "```"
    cleaned = raw_text.strip()

    # Opening fence: drop the whole first line, which also removes a
    # language marker such as "json".
    if cleaned.startswith(fence):
        first_line_end = cleaned.find("\n")
        if first_line_end != -1:
            cleaned = cleaned[first_line_end + 1:]

    # Closing fence: cut at the first fence that follows the last "}".
    # Searching only after the last brace leaves backticks inside the
    # document text alone, and it also covers replies that did not *start*
    # with a fence or that add commentary after the closing fence.
    last_brace = cleaned.rfind("}")
    if last_brace != -1:
        closing_fence = cleaned.find(fence, last_brace + 1)
        if closing_fence != -1:
            cleaned = cleaned[:closing_fence]
    elif cleaned.endswith(fence):
        cleaned = cleaned[:-len(fence)]

    # Drop any preamble ("json:", "Here is the result", ...) before the object.
    first_brace = cleaned.find("{")
    if first_brace > 0:
        cleaned = cleaned[first_brace:]

    return cleaned.strip()
142
 
143
 
 
156
 
157
 
158
  # -------------------------
159
+ # Extract text, tables, and images (ENHANCED)
160
  # -------------------------
161
def extract_text_and_layout(file_bytes: bytes, kind: str):
    """
    Extract text, tables and images from an uploaded document.

    Args:
        file_bytes: Raw file content.
        kind: "pdf", "docx" or "txt"; any other value yields empty text.

    Returns:
        (text, extracted_data) where extracted_data is a dict with:
          - "tables": list of {"data": rows, "position": label}; PDF entries
            also carry "page".
          - "images": list of {"data": base64 string, "position": label};
            PDF entries also carry "page" and "ext".
        Both lists are empty for "txt" and unknown kinds.

    Raises:
        RuntimeError: If reading the document fails. The original exception
            is chained as the cause.
    """
    extracted_data = {"tables": [], "images": []}

    try:
        if kind == "pdf":
            return _extract_pdf_content(file_bytes, extracted_data), extracted_data

        if kind == "docx":
            return _extract_docx_content(file_bytes, extracted_data), extracted_data

        if kind == "txt":
            # utf-8-sig decodes exactly like utf-8 but drops a leading BOM,
            # which would otherwise end up as U+FEFF at the start of the text.
            return file_bytes.decode("utf-8-sig", errors="ignore"), extracted_data

        return "", extracted_data

    except Exception as e:
        raise RuntimeError(f"Text extraction failed: {e}") from e


def _extract_pdf_content(file_bytes: bytes, extracted_data: dict) -> str:
    """Return the PDF's text; append its tables and images to extracted_data."""
    doc = fitz.open(stream=file_bytes, filetype="pdf")
    try:
        texts = []
        for page_num, page in enumerate(doc, start=1):
            texts.append(page.get_text())

            # Table detection is best-effort: a failure here (including a
            # PyMuPDF build without find_tables) must not cost us the text.
            try:
                tables = page.find_tables().tables
            except Exception:
                tables = []

            for table_idx, table in enumerate(tables, start=1):
                try:
                    table_data = table.extract()
                except Exception:
                    continue  # skip only the table that failed
                if table_data:
                    extracted_data["tables"].append({
                        "page": page_num,
                        "data": table_data,
                        "position": f"page_{page_num}_table_{table_idx}",
                    })

            for img_idx, img in enumerate(page.get_images(), start=1):
                try:
                    base_image = doc.extract_image(img[0])  # img[0] is the xref
                    extracted_data["images"].append({
                        "page": page_num,
                        "data": base64.b64encode(base_image["image"]).decode(),
                        "ext": base_image["ext"],
                        "position": f"page_{page_num}_img_{img_idx}",
                    })
                except Exception:
                    continue  # skip only the image that failed

        return "\n\n".join(texts)
    finally:
        # Release the PyMuPDF document even if a page fails to parse.
        doc.close()


def _extract_docx_content(file_bytes: bytes, extracted_data: dict) -> str:
    """Return the DOCX paragraph text; append its tables and images to extracted_data."""
    from docx import Document as DocReader

    doc = DocReader(io.BytesIO(file_bytes))
    texts = [para.text for para in doc.paragraphs if para.text.strip()]

    for table_idx, table in enumerate(doc.tables, start=1):
        table_data = [[cell.text for cell in row.cells] for row in table.rows]
        if table_data:
            extracted_data["tables"].append({
                "data": table_data,
                "position": f"table_{table_idx}",
            })

    # Images are found through the document part's relationships.
    for rel in doc.part.rels.values():
        if "image" not in rel.target_ref:
            continue
        try:
            image_base64 = base64.b64encode(rel.target_part.blob).decode()
        except Exception:
            continue  # best-effort: skip a relationship whose bytes cannot be read
        extracted_data["images"].append({
            "data": image_base64,
            "position": f"image_{len(extracted_data['images']) + 1}",
        })

    return "\n\n".join(texts)
259
 
260
 
261
  # -------------------------
262
+ # Gemini formatting -> JSON layout (ENHANCED with tables & images)
263
  # -------------------------
264
+ def enhance_text_with_gemini(text: str, doc_type: str = "auto", user_prompt: str = "",
265
+ extracted_data: Dict = None) -> str:
266
  """
267
+ Enhanced: Now includes table and image information in the prompt.
 
 
 
 
 
 
268
  """
269
  if not GEMINI_API_KEYS:
270
  return json.dumps({"error": "No GEMINI_API_KEY configured"})
271
 
 
272
  is_long_doc = len(text) > 10000
273
 
274
+ # Build user instructions
275
  user_instructions = ""
276
  if user_prompt.strip():
277
  user_instructions = f"""
 
281
  IMPORTANT: Follow the user's instructions while maintaining the JSON format and document structure.
282
  """
283
 
284
+ # Add information about extracted tables and images
285
+ extracted_info = ""
286
+ if extracted_data:
287
+ if extracted_data.get("tables"):
288
+ extracted_info += f"\n\nDOCUMENT CONTAINS {len(extracted_data['tables'])} TABLES. Preserve and format them appropriately."
289
+ if extracted_data.get("images"):
290
+ extracted_info += f"\nDOCUMENT CONTAINS {len(extracted_data['images'])} IMAGES. Note their positions for reference."
291
+
292
  prompt = f"""You are a professional document formatter and editor. Analyze the INPUT TEXT and enhance it.
293
 
294
  INPUT TEXT:
295
  {text}
296
+ {extracted_info}
297
 
298
  DOCUMENT TYPE: {doc_type}
299
  {user_instructions}
300
+
301
  TASK:
302
  1) Analyze the document type (resume, cover letter, report, article, essay, notes, etc.)
303
  2) {"Apply the user's specific instructions" if user_prompt.strip() else "Improve grammar, clarity, and professional tone while preserving all original information"}
304
+ 3) Organize content with appropriate structure (headings, paragraphs, lists, tables)
305
+ 4) Preserve emojis, special characters, and Unicode symbols
306
+ 5) Output ONLY valid JSON following the schema below - NO markdown, NO commentary
307
 
308
  JSON SCHEMA:
309
  {{
310
  "document": [
311
+ {{ "type": "heading", "level": 1, "text": "Main Title" }},
312
  {{ "type": "heading", "level": 2, "text": "Section Title" }},
313
+ {{ "type": "paragraph", "text": "Regular text with emojis 😊", "align": "left" }},
314
+ {{ "type": "bullet_list", "items": ["Item 1", "Item 2"] }},
315
+ {{ "type": "number_list", "items": ["Step 1", "Step 2"] }},
316
+ {{ "type": "table", "rows": [["Header1", "Header2"], ["Value1", "Value2"]], "has_header": true }},
317
+ {{ "type": "image_placeholder", "position": "center", "caption": "Figure 1: Description" }}
318
  ]
319
  }}
320
 
321
  FORMATTING RULES:
322
  - Use level 1 heading for document title only
323
  - Use level 2 headings for major sections
324
+ - PRESERVE all emojis, Unicode symbols (★, ✓, →, •, etc.), and special characters
325
  - {"Keep paragraphs BRIEF - combine similar content" if is_long_doc else "Keep paragraphs concise and well-structured"}
326
  - Use bullet_list for unordered items, number_list for sequences
327
+ - Use "table" type for tabular data with "has_header": true/false
328
+ - Use "image_placeholder" to mark where images should be inserted
329
  - {"IMPORTANT: For long documents, be concise - summarize repetitive sections" if is_long_doc else "Preserve ALL original content - do not omit information"}
330
+
331
+ TABLE FORMATTING:
332
+ - First row is typically headers (set "has_header": true)
333
+ - Include all rows and columns from source
334
+ - Preserve cell content including numbers, symbols, emojis
335
+
336
+ EMOJI & SYMBOL SUPPORT:
337
+ - Keep ALL emojis exactly as they appear (😊, 🎉, ❤️, etc.)
338
+ - Preserve Unicode symbols (★, ✓, →, •, ©, ®, ™, etc.)
339
+ - Maintain special characters (€, £, ¥, °, ±, etc.)
340
 
341
  DOCUMENT-SPECIFIC GUIDELINES:
342
  - **Resume/CV**: Name (h1), Contact (center paragraph), Summary, Skills, Experience, Education, Certifications
343
  - **Cover Letter**: Your Info, Date, Recipient Info, Salutation, Body, Closing
344
+ - **Report/Article**: Title (h1), Abstract, Introduction, Body Sections (h2), Tables, Figures, Conclusion
345
  - **Essay**: Title (h1, center), Author, Body paragraphs
346
+ - **Notes/General**: Logical headings, preserve lists, tables, and special formatting
347
 
348
  IMPORTANT:
349
  - Return COMPLETE, VALID JSON only
350
  - Ensure all strings are properly closed with quotes
351
+ - Escape special JSON characters properly
352
  - Ensure all brackets and braces are balanced
353
  - No trailing commas
354
  - No markdown fences
 
364
  }
365
  }
366
 
367
+ # Try each API key in sequence
368
  last_error = None
369
  for idx, api_key in enumerate(GEMINI_API_KEYS):
370
  try:
 
376
  timeout=240
377
  )
378
 
 
379
  if res.status_code == 429:
380
  print(f"API key #{idx + 1} rate limited, trying next...")
381
  last_error = "Rate limit exceeded"
 
389
  res.raise_for_status()
390
  resp_json = res.json()
391
 
 
392
  content = resp_json["candidates"][0]["content"]["parts"][0]["text"]
393
 
 
394
  if not content.rstrip().endswith("}") and not content.rstrip().endswith("]"):
395
  content = repair_truncated_json(content)
396
 
 
412
  last_error = str(e)
413
  continue
414
 
 
415
  return json.dumps({
416
  "error": f"All {len(GEMINI_API_KEYS)} API keys failed. Last error: {last_error}"
417
  })
418
 
419
 
420
  # -------------------------
421
+ # Advanced DOCX builder (ENHANCED with tables, images, emojis)
422
  # -------------------------
423
+ def build_docx_from_design(layout_json_text: str, extracted_data: Dict = None) -> bytes:
424
  """
425
+ Enhanced: Now supports tables, images, emojis, and Unicode symbols.
 
426
  """
427
  cleaned = clean_gemini_json(layout_json_text)
428
 
 
429
  if not cleaned.rstrip().endswith('}'):
430
  cleaned = repair_truncated_json(cleaned)
431
 
432
  try:
433
  data = json.loads(cleaned)
434
  except json.JSONDecodeError as e:
 
435
  try:
 
436
  last_valid = cleaned.rfind('}')
437
  if last_valid != -1:
 
438
  test_json = cleaned[:last_valid + 1] + ']}'
439
  data = json.loads(test_json)
440
  else:
 
453
  sec.left_margin = Inches(0.7)
454
  sec.right_margin = Inches(0.7)
455
 
456
+ # Default font (supports Unicode)
457
  try:
458
  style = doc.styles["Normal"]
459
  style.font.name = "Calibri"
 
483
  p.alignment = WD_ALIGN_PARAGRAPH.LEFT
484
  return p
485
 
486
+ # Track image usage
487
+ image_counter = 0
488
+
489
  for block in data.get("document", []):
490
  btype = block.get("type", "").lower()
491
 
 
521
  rows = block.get("rows", []) or []
522
  if not rows:
523
  continue
524
+
525
+ cols = len(rows[0]) if rows else 0
526
+ if cols == 0:
527
+ continue
528
+
529
  table = doc.add_table(rows=len(rows), cols=cols)
530
  table.style = "Table Grid"
531
+
532
+ has_header = block.get("has_header", True)
533
+
534
  for r_idx, row in enumerate(rows):
535
  for c_idx, cell in enumerate(row):
536
  if isinstance(cell, dict):
537
  text = str(cell.get("text", ""))
 
538
  else:
539
  text = str(cell)
540
+
541
  cell_obj = table.rows[r_idx].cells[c_idx]
542
  cell_obj.text = text
543
+
544
+ # Bold header row
545
+ if has_header and r_idx == 0:
546
  for p in cell_obj.paragraphs:
547
  for run in p.runs:
548
  run.bold = True
549
 
550
+ elif btype == "image_placeholder" or btype == "image":
551
+ # Insert image from extracted data
552
+ if extracted_data and extracted_data.get("images"):
553
+ if image_counter < len(extracted_data["images"]):
554
+ try:
555
+ img_data = extracted_data["images"][image_counter]
556
+ image_bytes = base64.b64decode(img_data["data"])
557
+ image_stream = io.BytesIO(image_bytes)
558
+
559
+ # Create paragraph for image
560
+ p = doc.add_paragraph()
561
+ align = block.get("position", "center")
562
+ if align == "center":
563
+ p.alignment = WD_ALIGN_PARAGRAPH.CENTER
564
+ elif align == "right":
565
+ p.alignment = WD_ALIGN_PARAGRAPH.RIGHT
566
+ else:
567
+ p.alignment = WD_ALIGN_PARAGRAPH.LEFT
568
+
569
+ # Add image
570
+ run = p.add_run()
571
+ run.add_picture(image_stream, width=Inches(4))
572
+
573
+ # Add caption if provided
574
+ caption = block.get("caption", "")
575
+ if caption:
576
+ cap_para = doc.add_paragraph(caption)
577
+ cap_para.alignment = p.alignment
578
+ for run in cap_para.runs:
579
+ run.font.size = Pt(9)
580
+ run.font.italic = True
581
+
582
+ image_counter += 1
583
+ except Exception as e:
584
+ print(f"Failed to insert image: {e}")
585
+ # Add placeholder text
586
+ doc.add_paragraph(f"[Image: {block.get('caption', 'Figure')}]")
587
+ else:
588
+ # No image data available, add placeholder
589
+ doc.add_paragraph(f"[Image: {block.get('caption', 'Figure')}]")
590
+
591
  elif btype == "page_break":
592
  doc.add_page_break()
593
 
 
623
  # -------------------------
@app.post("/extract")
async def route_extract(file: UploadFile = File(...)):
    """Return the extracted text of an upload plus table and image counts.

    The uploaded bytes are read, the file kind is derived from the filename,
    and both are handed to ``extract_text_and_layout``. Only the text and the
    number of tables/images found are returned to the caller.

    On any exception the traceback is printed and a JSON body of the form
    ``{"error": "<message>"}`` is returned with HTTP status 500.
    """
    try:
        payload = await file.read()
        file_kind = detect_file_type(file.filename)
        text, extracted_data = extract_text_and_layout(payload, file_kind)

        # Missing keys are treated as "nothing found" rather than an error.
        tables_count = len(extracted_data.get("tables", []))
        images_count = len(extracted_data.get("images", []))

        return {
            "text": text,
            "tables_count": tables_count,
            "images_count": images_count,
        }
    except Exception as exc:
        traceback.print_exc()
        return JSONResponse({"error": str(exc)}, status_code=500)
 
645
  prompt: str = ""
646
  ):
647
  """
648
+ Enhanced document processor with table, image, and emoji support.
 
 
 
 
 
649
  """
650
  try:
651
  fb = await file.read()
652
  kind = detect_file_type(file.filename)
653
+ text, extracted_data = extract_text_and_layout(fb, kind)
654
 
655
  if not text.strip():
656
  return JSONResponse({"error": "No text extracted from document"}, status_code=400)
657
 
658
+ # Enhanced: Pass extracted data to Gemini
659
+ raw = enhance_text_with_gemini(text, doc_type, prompt, extracted_data)
660
  cleaned = clean_gemini_json(raw)
661
 
 
662
  try:
663
  test_parse = json.loads(cleaned)
664
  if "error" in test_parse:
 
666
  except:
667
  pass
668
 
669
+ # Enhanced: Pass extracted data to DOCX builder
670
+ docx_bytes = build_docx_from_design(cleaned, extracted_data)
671
 
 
672
  safe_filename = sanitize_filename(f"Enhanced_{file.filename or 'document.docx'}")
673
 
674
  return StreamingResponse(
 
693
  prompt: str = ""
694
  ):
695
  """
696
+ Preview with table and image information.
 
697
  """
698
  try:
699
  fb = await file.read()
700
  kind = detect_file_type(file.filename)
701
+ text, extracted_data = extract_text_and_layout(fb, kind)
702
 
703
  if not text.strip():
704
  return JSONResponse({"error": "No text extracted"}, status_code=400)
705
 
706
+ raw = enhance_text_with_gemini(text, doc_type, prompt, extracted_data)
707
  cleaned = clean_gemini_json(raw)
708
 
709
+ return {
710
+ "layout_json": json.loads(cleaned),
711
+ "extracted_data": {
712
+ "tables_count": len(extracted_data.get("tables", [])),
713
+ "images_count": len(extracted_data.get("images", []))
714
+ }
715
+ }
716
  except Exception as e:
717
  traceback.print_exc()
718
  return JSONResponse({"error": str(e)}, status_code=500)
 
724
  doc_type: str = "auto",
725
  prompt: str = ""
726
  ):
727
+ """Legacy endpoint with enhanced features."""
 
 
728
  try:
729
  fb = await file.read()
730
  kind = detect_file_type(file.filename)
731
+ text, extracted_data = extract_text_and_layout(fb, kind)
732
 
733
  if not text.strip():
734
  return JSONResponse({"error": "No text extracted"}, status_code=400)
735
 
736
+ raw = enhance_text_with_gemini(text, doc_type, prompt, extracted_data)
737
  cleaned = clean_gemini_json(raw)
738
+ docx_bytes = build_docx_from_design(cleaned, extracted_data)
739
 
740
  safe_filename = sanitize_filename(f"Professional_{file.filename or 'document.docx'}")
741
 
 
759
  doc_type: str = "auto",
760
  prompt: str = ""
761
  ):
762
+ """Full pipeline with enhanced features."""
 
 
 
763
  try:
764
  fb = await file.read()
765
  kind = detect_file_type(file.filename)
766
+ text, extracted_data = extract_text_and_layout(fb, kind)
767
 
768
  if not text.strip():
769
  return JSONResponse({"error": "No text extracted"}, status_code=400)
770
 
771
+ raw = enhance_text_with_gemini(text, doc_type, prompt, extracted_data)
772
  cleaned = clean_gemini_json(raw)
773
+ docx_bytes = build_docx_from_design(cleaned, extracted_data)
774
 
775
  safe_filename = sanitize_filename(f"Enhanced_{file.filename or 'document.docx'}")
776
 
 
791
  @app.post("/add-signature")
792
  async def route_add_signature(
793
  file: UploadFile = File(...),
794
+ signature: str = Form(...),
795
+ position: str = Form("bottom-right"),
796
  signer_name: Optional[str] = Form(None)
797
  ):
798
  """
 
825
  except Exception as e:
826
  return JSONResponse({"error": f"Invalid signature image: {str(e)}"}, status_code=400)
827
 
 
 
 
828
  # Add signature at the end of document
 
829
  doc.add_paragraph()
830
 
831
  # Create signature paragraph
 
@app.get("/")
def root():
    """Return a static description of the service.

    The payload lists the service name and status, the supported document
    types, the newly added features, the available endpoints, and short
    usage hints. Nothing is computed; every value is a literal.
    """
    supported_types = [
        "Resume/CV",
        "Cover Letter",
        "Report",
        "Article",
        "Essay",
        "Notes",
        "Any text document",
    ]

    new_features = [
        "✓ Table extraction and formatting",
        "✓ Image extraction and insertion",
        "✓ Emoji and Unicode symbol preservation (😊, ★, ✓, →)",
        "✓ Enhanced formatting with icons",
        "✓ Multi-column table support",
        "✓ Image captions and positioning",
    ]

    endpoints = {
        "/extract": "Extract text, tables, and images from document",
        "/enhance": "Full pipeline: extract + AI enhancement + DOCX with tables/images (RECOMMENDED)",
        "/add-signature": "Add signature to existing DOCX file",
        "/preview": "Preview JSON layout with table/image counts",
        "/design": "Same as /enhance (legacy)",
        "/full": "Same as /enhance (legacy)",
    }

    supported_content = [
        "Text with emojis (😊🎉❤️)",
        "Unicode symbols (★✓→•©®™)",
        "Special chars (€£¥°±)",
        "Tables (with headers)",
        "Images (inline)",
        "Lists (bullet/numbered)",
        "Headers and formatting",
    ]

    usage = {
        "basic": "POST /enhance with file upload",
        "with_prompt": "POST /enhance?prompt=your_instructions&doc_type=auto",
        "supported_content": supported_content,
    }

    # Key order matches the original literal so the serialized JSON is unchanged.
    return {
        "service": "Universal Document Enhancer v6",
        "status": "ok",
        "description": "AI-powered document formatter with table, image, emoji, and icon support",
        "supported_types": supported_types,
        "new_features": new_features,
        "endpoints": endpoints,
        "usage": usage,
    }
914
 
 
918
  return {
919
  "status": "healthy",
920
  "api_keys_configured": len(GEMINI_API_KEYS),
921
+ "version": "6.0 - Enhanced with Tables, Images, Emojis & Icons",
922
+ "features": {
923
+ "tables": "✓ Supported",
924
+ "images": "✓ Supported",
925
+ "emojis": "✓ Supported",
926
+ "unicode": "✓ Supported",
927
+ "multi_api_keys": f"✓ {len(GEMINI_API_KEYS)} keys configured"
928
+ }
929
+ }
930
+
931
+
if __name__ == "__main__":
    # Script entry point: serve the FastAPI app with uvicorn on all
    # interfaces, port 7860. uvicorn is imported here so that merely
    # importing this module does not require it.
    import uvicorn

    uvicorn.run(
        app,
        host="0.0.0.0",
        port=7860,
    )