"""
LLM-based OCR using Groq vision model.

Supports: PDF (text + scanned), Images, DOCX, TXT
Arabic + English
"""

import base64
import io
from pathlib import Path

# Maps each supported image extension to the media type declared in the
# data URL sent to the vision model, so the label matches the real bytes.
_IMAGE_MEDIA_TYPES = {
    ".png": "image/png",
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
    ".tiff": "image/tiff",
    ".bmp": "image/bmp",
    ".webp": "image/webp",
}


def extract_text_with_llm(file_bytes: bytes, filename: str, groq, vision_model: str) -> str:
    """Extract text from an uploaded file, dispatching on its extension.

    Args:
        file_bytes: Raw content of the file.
        filename: Name of the file; only its (case-insensitive) suffix is
            used, to choose the extraction strategy.
        groq: Client object exposing ``chat.completions.create(...)``; it is
            only called for images and for PDFs without a text layer.
        vision_model: Model name forwarded to the client for OCR requests.

    Returns:
        The extracted text.

    Raises:
        ValueError: If the file extension is not one of the supported types.
    """
    ext = Path(filename).suffix.lower()

    if ext == ".txt":
        return _extract_txt(file_bytes)
    if ext == ".docx":
        return _extract_docx(file_bytes)
    if ext == ".pdf":
        return _extract_pdf(file_bytes, groq, vision_model)

    media_type = _IMAGE_MEDIA_TYPES.get(ext)
    if media_type is not None:
        return _llm_ocr_image(file_bytes, media_type, groq, vision_model)

    raise ValueError(f"Unsupported file type: {ext}")


# ─── TXT ─────────────────────────────────────────────────

def _extract_txt(file_bytes: bytes) -> str:
    """Decode plain-text bytes as UTF-8, falling back to Latin-1.

    A leading UTF-8 byte-order mark is dropped so it does not end up as an
    invisible character at the start of the returned text.
    """
    try:
        # "utf-8-sig" decodes plain UTF-8 too; it only adds BOM stripping.
        return file_bytes.decode("utf-8-sig")
    except UnicodeDecodeError:
        # Latin-1 maps every byte to a code point, so this cannot fail.
        return file_bytes.decode("latin-1")


# ─── DOCX ────────────────────────────────────────────────

def _extract_docx(file_bytes: bytes) -> str:
    """Return the non-blank paragraphs of a DOCX file, one per line.

    Only ``Document.paragraphs`` is read; any other content of the document
    is not included in the result.
    """
    import docx

    doc = docx.Document(io.BytesIO(file_bytes))
    return "\n".join(p.text for p in doc.paragraphs if p.text.strip())


# ─── PDF ─────────────────────────────────────────────────

def _extract_pdf(file_bytes: bytes, groq, vision_model: str) -> str:
    """Extract text from a PDF, using LLM OCR when it has no text layer.

    The embedded text of every page is collected first (each page followed
    by a newline). If that yields nothing but whitespace, the PDF is treated
    as scanned and every page is rendered and sent to the vision model.
    """
    import fitz  # PyMuPDF

    doc = fitz.open(stream=file_bytes, filetype="pdf")
    try:
        text = "".join(page.get_text("text") + "\n" for page in doc)
    finally:
        # Release the document even if a page fails to parse.
        doc.close()

    # If no text found → scanned PDF → use LLM OCR on each page image
    if not text.strip():
        text = _ocr_pdf_pages_with_llm(file_bytes, groq, vision_model)

    return text


def _ocr_pdf_pages_with_llm(file_bytes: bytes, groq, vision_model: str) -> str:
    """Render each PDF page to a PNG and OCR it with the vision model.

    Returns:
        The per-page OCR results joined by a blank line.
    """
    import fitz  # PyMuPDF

    zoom = fitz.Matrix(2.0, 2.0)  # 2x zoom = ~144 DPI
    texts = []

    doc = fitz.open(stream=file_bytes, filetype="pdf")
    try:
        for page in doc:
            pix = page.get_pixmap(matrix=zoom)
            img_bytes = pix.tobytes("png")
            texts.append(_llm_ocr_image(img_bytes, "image/png", groq, vision_model))
    finally:
        # An OCR request may raise midway; do not leak the open document.
        doc.close()

    return "\n\n".join(texts)


# ─── LLM Vision OCR ──────────────────────────────────────

def _llm_ocr_image(image_bytes: bytes, media_type: str, groq, vision_model: str) -> str:
    """Send one image to the vision model and return the text it extracts.

    Args:
        image_bytes: Encoded image content.
        media_type: MIME type of ``image_bytes``, placed in the data URL.
        groq: Client object exposing ``chat.completions.create(...)``.
        vision_model: Model name passed as ``model``.

    Returns:
        The first choice's message content, stripped of surrounding
        whitespace; an empty string if the message has no content.
    """
    b64 = base64.standard_b64encode(image_bytes).decode("utf-8")

    response = groq.chat.completions.create(
        model=vision_model,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{media_type};base64,{b64}"
                        },
                    },
                    {
                        "type": "text",
                        "text": (
                            "Extract ALL text from this image exactly as written. "
                            "Support both Arabic (right-to-left) and English text. "
                            "Preserve paragraphs and line breaks. "
                            "Return ONLY the extracted text, nothing else."
                        ),
                    },
                ],
            }
        ],
        temperature=0,
    )

    # The content can be None (no text produced); treat that as empty text
    # instead of failing on None.strip().
    content = response.choices[0].message.content
    return (content or "").strip()