# rag / ocr_extractor.py
# manarsaber11's picture
# Upload 7 files
# b6c0274 verified
"""
LLM-based OCR using Groq vision model.
Supports: PDF (text + scanned), Images, DOCX, TXT
Arabic + English
"""
import io
import base64
from pathlib import Path
def extract_text_with_llm(file_bytes: bytes, filename: str, groq, vision_model: str) -> str:
    """Extract text from a file, choosing the extractor by file extension.

    Args:
        file_bytes: Raw contents of the file.
        filename: File name; only its suffix is inspected (case-insensitively).
        groq: Client object passed through to the PDF and image extractors.
        vision_model: Model identifier passed through to the PDF and image
            extractors.

    Returns:
        The extracted text.

    Raises:
        ValueError: If the file extension is not one of the supported types.
    """
    # The media type declared in the data URL must describe the actual bytes;
    # previously every image was labelled "image/png" regardless of format.
    image_media_types = {
        ".png": "image/png",
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".tiff": "image/tiff",
        ".bmp": "image/bmp",
        ".webp": "image/webp",
    }

    ext = Path(filename).suffix.lower()
    if ext == ".txt":
        return _extract_txt(file_bytes)
    if ext == ".docx":
        return _extract_docx(file_bytes)
    if ext == ".pdf":
        return _extract_pdf(file_bytes, groq, vision_model)
    if ext in image_media_types:
        return _llm_ocr_image(file_bytes, image_media_types[ext], groq, vision_model)
    raise ValueError(f"Unsupported file type: {ext}")
# ─── TXT ─────────────────────────────────────────────────
def _extract_txt(file_bytes: bytes) -> str:
    """Decode the bytes of a plain-text file into a string.

    Decoding order:
      1. UTF-16, when the data starts with a UTF-16 byte-order mark.
      2. UTF-8 (a leading UTF-8 byte-order mark, if any, is stripped).
      3. Latin-1 as a last resort; it maps every byte, so it cannot fail.

    Args:
        file_bytes: Raw contents of the text file.

    Returns:
        The decoded text, without any leading byte-order mark.
    """
    # BOM-marked UTF-16 is not valid UTF-8 and would otherwise fall through to
    # the Latin-1 fallback and come out as mojibake.
    if file_bytes.startswith((b"\xff\xfe", b"\xfe\xff")):
        try:
            return file_bytes.decode("utf-16")
        except UnicodeDecodeError:
            # Not actually UTF-16 (e.g. truncated data); try the other codecs.
            pass
    try:
        # "utf-8-sig" behaves like "utf-8" but drops a leading BOM instead of
        # leaving U+FEFF at the start of the text.
        return file_bytes.decode("utf-8-sig")
    except UnicodeDecodeError:
        return file_bytes.decode("latin-1")
# ─── DOCX ────────────────────────────────────────────────
def _extract_docx(file_bytes: bytes) -> str:
    """Return the text of a DOCX file's non-blank paragraphs, one per line.

    Args:
        file_bytes: Raw contents of the .docx file.

    Returns:
        The paragraph texts joined with newlines; paragraphs that are empty or
        whitespace-only are skipped.
    """
    import docx

    document = docx.Document(io.BytesIO(file_bytes))
    kept_lines = []
    for paragraph in document.paragraphs:
        if not paragraph.text.strip():
            continue
        kept_lines.append(paragraph.text)
    return "\n".join(kept_lines)
# ─── PDF ─────────────────────────────────────────────────
def _extract_pdf(file_bytes: bytes, groq, vision_model: str) -> str:
    """Extract text from a PDF, falling back to LLM OCR when it has no text.

    The text of every page is read first. If the whole document yields only
    whitespace, it is treated as a scanned PDF and each page is OCR'd instead.

    Args:
        file_bytes: Raw contents of the PDF file.
        groq: Client object passed through to the OCR fallback.
        vision_model: Model identifier passed through to the OCR fallback.

    Returns:
        The page texts, each followed by a newline, or the OCR result when no
        text was found.
    """
    import fitz  # PyMuPDF

    doc = fitz.open(stream=file_bytes, filetype="pdf")
    try:
        text = "".join(page.get_text("text") + "\n" for page in doc)
    finally:
        # Release the document even if a page fails to extract.
        doc.close()

    # If no text found → scanned PDF → use LLM OCR on each page image
    if not text.strip():
        text = _ocr_pdf_pages_with_llm(file_bytes, groq, vision_model)
    return text
def _ocr_pdf_pages_with_llm(file_bytes: bytes, groq, vision_model: str) -> str:
    """Render each PDF page to a PNG image and OCR it with the vision model.

    Args:
        file_bytes: Raw contents of the PDF file.
        groq: Client object passed through to the image OCR call.
        vision_model: Model identifier passed through to the image OCR call.

    Returns:
        The per-page OCR texts joined by blank lines, in page order.
    """
    import fitz  # PyMuPDF

    zoom = fitz.Matrix(2.0, 2.0)  # 2x zoom = ~144 DPI
    doc = fitz.open(stream=file_bytes, filetype="pdf")
    try:
        texts = [
            _llm_ocr_image(
                page.get_pixmap(matrix=zoom).tobytes("png"),
                "image/png",
                groq,
                vision_model,
            )
            for page in doc
        ]
    finally:
        # Release the document even if rendering or an OCR request fails.
        doc.close()
    return "\n\n".join(texts)
# ─── LLM Vision OCR ──────────────────────────────────────
def _llm_ocr_image(image_bytes: bytes, media_type: str, groq, vision_model: str) -> str:
    """Ask the vision model to transcribe the text contained in an image.

    The image is embedded in the request as a base64 ``data:`` URL, together
    with an instruction to return only the extracted text.

    Args:
        image_bytes: Raw encoded image data.
        media_type: Media type placed in the data URL (e.g. "image/png").
        groq: Client object; ``groq.chat.completions.create`` is called with
            the model, the message list and ``temperature=0``.
        vision_model: Model identifier sent as ``model``.

    Returns:
        The content of the first choice's message with surrounding whitespace
        stripped, or an empty string when the message has no content.
    """
    encoded = base64.standard_b64encode(image_bytes).decode("utf-8")
    prompt = (
        "Extract ALL text from this image exactly as written. "
        "Support both Arabic (right-to-left) and English text. "
        "Preserve paragraphs and line breaks. "
        "Return ONLY the extracted text, nothing else."
    )
    response = groq.chat.completions.create(
        model=vision_model,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{media_type};base64,{encoded}"
                        },
                    },
                    {
                        "type": "text",
                        "text": prompt,
                    },
                ],
            }
        ],
        temperature=0,
    )
    content = response.choices[0].message.content
    # The message content can be None; treat that as "no text found" rather
    # than failing with AttributeError on .strip().
    if content is None:
        return ""
    return content.strip()