Spaces:
Running
Running
| """ | |
| LLM-based OCR using Groq vision model. | |
| Supports: PDF (text + scanned), Images, DOCX, TXT | |
| Arabic + English | |
| """ | |
| import io | |
| import base64 | |
| from pathlib import Path | |
def extract_text_with_llm(file_bytes: bytes, filename: str, groq, vision_model: str) -> str:
    """Extract text from a file, choosing the extractor by file extension.

    The extension of ``filename`` is compared case-insensitively:

    * ``.txt``  -> ``_extract_txt``
    * ``.docx`` -> ``_extract_docx``
    * ``.pdf``  -> ``_extract_pdf``
    * ``.png``, ``.jpg``, ``.jpeg``, ``.tiff``, ``.bmp``, ``.webp``
      -> ``_llm_ocr_image`` with the media type matching the extension

    Args:
        file_bytes: Raw content of the file.
        filename: Name of the file; only its extension is inspected.
        groq: Client object forwarded to the PDF and image extractors
            (presumably a Groq SDK client -- confirm against the caller).
        vision_model: Model identifier forwarded to those extractors.

    Returns:
        The extracted text.

    Raises:
        ValueError: If the extension is not one of the supported types.
    """
    # Each image extension maps to its real media type so the data URL sent to
    # the vision model describes the bytes correctly (previously every image
    # was labelled "image/png").
    # NOTE(review): whether the vision endpoint accepts TIFF/BMP is not
    # visible from this file -- confirm against the provider's documentation.
    image_media_types = {
        ".png": "image/png",
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".tiff": "image/tiff",
        ".bmp": "image/bmp",
        ".webp": "image/webp",
    }

    ext = Path(filename).suffix.lower()
    if ext == ".txt":
        return _extract_txt(file_bytes)
    if ext == ".docx":
        return _extract_docx(file_bytes)
    if ext == ".pdf":
        return _extract_pdf(file_bytes, groq, vision_model)
    if ext in image_media_types:
        return _llm_ocr_image(file_bytes, image_media_types[ext], groq, vision_model)
    raise ValueError(f"Unsupported file type: {ext}")
# ─── TXT ─────────────────────────────────────────────────
| def _extract_txt(file_bytes: bytes) -> str: | |
| try: | |
| return file_bytes.decode("utf-8") | |
| except UnicodeDecodeError: | |
| return file_bytes.decode("latin-1", errors="ignore") | |
# ─── DOCX ────────────────────────────────────────────────
def _extract_docx(file_bytes: bytes) -> str:
    """Return the text of a DOCX file's non-blank paragraphs.

    The bytes are opened in memory with ``docx.Document``. Paragraphs whose
    text is empty or whitespace-only are skipped; the remaining paragraph
    texts are joined with single newlines.

    Args:
        file_bytes: Raw content of the ``.docx`` file.

    Returns:
        The paragraph texts joined by ``"\\n"``.
    """
    import docx

    document = docx.Document(io.BytesIO(file_bytes))
    kept_paragraphs = []
    for paragraph in document.paragraphs:
        content = paragraph.text
        if not content.strip():
            continue
        kept_paragraphs.append(content)
    return "\n".join(kept_paragraphs)
# ─── PDF ─────────────────────────────────────────────────
def _extract_pdf(file_bytes: bytes, groq, vision_model: str) -> str:
    """Extract text from a PDF, using LLM OCR when it has no text layer.

    Each page's embedded text is read with PyMuPDF and followed by a newline.
    If the combined result is empty or whitespace-only, the PDF is treated as
    scanned and every page is OCR'd via ``_ocr_pdf_pages_with_llm``.

    Args:
        file_bytes: Raw content of the PDF file.
        groq: Client object forwarded to the OCR fallback.
        vision_model: Model identifier forwarded to the OCR fallback.

    Returns:
        The embedded text, or the OCR result when no embedded text exists.
    """
    import fitz  # PyMuPDF

    # The context manager closes the document even if reading a page raises.
    with fitz.open(stream=file_bytes, filetype="pdf") as doc:
        text = "".join(page.get_text("text") + "\n" for page in doc)

    # If no text found → scanned PDF → use LLM OCR on each page image
    if not text.strip():
        text = _ocr_pdf_pages_with_llm(file_bytes, groq, vision_model)
    return text
def _ocr_pdf_pages_with_llm(file_bytes: bytes, groq, vision_model: str) -> str:
    """OCR every page of a PDF by rendering it to PNG and querying the LLM.

    Each page is rasterised with a 2x zoom matrix, encoded as PNG, and passed
    to ``_llm_ocr_image``. The per-page results are joined with blank lines.

    Args:
        file_bytes: Raw content of the PDF file.
        groq: Client object forwarded to ``_llm_ocr_image``.
        vision_model: Model identifier forwarded to ``_llm_ocr_image``.

    Returns:
        The OCR text of all pages, separated by ``"\\n\\n"``.
    """
    import fitz  # PyMuPDF

    # The zoom matrix is the same for every page, so build it once.
    zoom = fitz.Matrix(2.0, 2.0)  # 2x zoom = ~144 DPI

    # The context manager closes the document even if an OCR request raises.
    with fitz.open(stream=file_bytes, filetype="pdf") as doc:
        page_texts = [
            _llm_ocr_image(
                page.get_pixmap(matrix=zoom).tobytes("png"),
                "image/png",
                groq,
                vision_model,
            )
            for page in doc
        ]
    return "\n\n".join(page_texts)
# ─── LLM Vision OCR ──────────────────────────────────────
| def _llm_ocr_image(image_bytes: bytes, media_type: str, groq, vision_model: str) -> str: | |
| b64 = base64.standard_b64encode(image_bytes).decode("utf-8") | |
| response = groq.chat.completions.create( | |
| model=vision_model, | |
| messages=[ | |
| { | |
| "role": "user", | |
| "content": [ | |
| { | |
| "type": "image_url", | |
| "image_url": { | |
| "url": f"data:{media_type};base64,{b64}" | |
| }, | |
| }, | |
| { | |
| "type": "text", | |
| "text": ( | |
| "Extract ALL text from this image exactly as written. " | |
| "Support both Arabic (right-to-left) and English text. " | |
| "Preserve paragraphs and line breaks. " | |
| "Return ONLY the extracted text, nothing else." | |
| ), | |
| }, | |
| ], | |
| } | |
| ], | |
| temperature=0, | |
| ) | |
| return response.choices[0].message.content.strip() | |