Spaces:

manarsaber11
/

rag

Running

App Files Files Community

rag / ocr_extractor.py

manarsaber11

Upload 7 files

b6c0274 verified about 1 month ago

raw

history blame contribute delete

3.86 kB

	"""
	LLM-based OCR using Groq vision model.
	Supports: PDF (text + scanned), Images, DOCX, TXT
	Arabic + English
	"""

	import io
	import base64
	from pathlib import Path


	def extract_text_with_llm(file_bytes: bytes, filename: str, groq, vision_model: str) -> str:
	    """Extract text from a file, choosing the strategy by its extension.

	    ``.txt`` and ``.docx`` files are decoded locally, ``.pdf`` files go
	    through ``_extract_pdf``, and image files are sent to
	    ``_llm_ocr_image``.

	    Args:
	        file_bytes: Raw content of the file.
	        filename: Name of the file; only its suffix is used, compared
	            case-insensitively.
	        groq: Client object forwarded to the PDF and image helpers.
	        vision_model: Model name forwarded to the PDF and image helpers.

	    Returns:
	        The extracted text.

	    Raises:
	        ValueError: If the extension is not one of the supported types.
	    """
	    # Media type per image extension, so the data URL built downstream
	    # declares the format the bytes are really in instead of labelling
	    # every image as PNG.
	    image_media_types = {
	        ".png": "image/png",
	        ".jpg": "image/jpeg",
	        ".jpeg": "image/jpeg",
	        ".tiff": "image/tiff",
	        ".bmp": "image/bmp",
	        ".webp": "image/webp",
	    }

	    ext = Path(filename).suffix.lower()

	    if ext == ".txt":
	        return _extract_txt(file_bytes)
	    if ext == ".docx":
	        return _extract_docx(file_bytes)
	    if ext == ".pdf":
	        return _extract_pdf(file_bytes, groq, vision_model)
	    if ext in image_media_types:
	        return _llm_ocr_image(file_bytes, image_media_types[ext], groq, vision_model)

	    raise ValueError(f"Unsupported file type: {ext}")


	# ─── TXT ─────────────────────────────────────────────────
	def _extract_txt(file_bytes: bytes) -> str:
	try:
	return file_bytes.decode("utf-8")
	except UnicodeDecodeError:
	return file_bytes.decode("latin-1", errors="ignore")


	# ─── DOCX ────────────────────────────────────────────────
	def _extract_docx(file_bytes: bytes) -> str:
	    """Return the non-blank paragraphs of a DOCX file, one per line.

	    Args:
	        file_bytes: Raw content of the ``.docx`` file.

	    Returns:
	        The paragraph texts joined with newlines; paragraphs that are
	        empty or contain only whitespace are left out.
	    """
	    import docx

	    document = docx.Document(io.BytesIO(file_bytes))

	    kept = []
	    for paragraph in document.paragraphs:
	        content = paragraph.text
	        if content.strip():
	            kept.append(content)

	    return "\n".join(kept)


	# ─── PDF ─────────────────────────────────────────────────
	def _extract_pdf(file_bytes: bytes, groq, vision_model: str) -> str:
	    """Extract text from a PDF, falling back to LLM OCR when it has none.

	    The text of every page is read with PyMuPDF first. If the result is
	    empty or whitespace only, the file is handed to
	    ``_ocr_pdf_pages_with_llm`` instead.

	    Args:
	        file_bytes: Raw content of the PDF file.
	        groq: Client object forwarded to the OCR fallback.
	        vision_model: Model name forwarded to the OCR fallback.

	    Returns:
	        The page texts, each followed by a newline, or the OCR result
	        when no text was found.
	    """
	    import fitz  # PyMuPDF

	    doc = fitz.open(stream=file_bytes, filetype="pdf")
	    try:
	        # One join instead of repeated "+=": same output, linear cost.
	        text = "".join(page.get_text("text") + "\n" for page in doc)
	    finally:
	        # Release the document even if reading a page raises.
	        doc.close()

	    # If no text found -> presumably a scanned PDF -> use LLM OCR on each
	    # page image.
	    if not text.strip():
	        text = _ocr_pdf_pages_with_llm(file_bytes, groq, vision_model)

	    return text


	def _ocr_pdf_pages_with_llm(file_bytes: bytes, groq, vision_model: str) -> str:
	    """OCR every page of a PDF by rendering it to PNG and calling the LLM.

	    Args:
	        file_bytes: Raw content of the PDF file.
	        groq: Client object forwarded to ``_llm_ocr_image``.
	        vision_model: Model name forwarded to ``_llm_ocr_image``.

	    Returns:
	        The per-page OCR results in page order, separated by blank lines.
	    """
	    import fitz  # PyMuPDF

	    # 2x zoom = ~144 DPI. The matrix is the same for every page, so it is
	    # built once rather than per iteration.
	    zoom = fitz.Matrix(2.0, 2.0)

	    doc = fitz.open(stream=file_bytes, filetype="pdf")
	    try:
	        texts = [
	            _llm_ocr_image(
	                page.get_pixmap(matrix=zoom).tobytes("png"),
	                "image/png",
	                groq,
	                vision_model,
	            )
	            for page in doc
	        ]
	    finally:
	        # Release the document even if rendering or an OCR call raises.
	        doc.close()

	    return "\n\n".join(texts)


	# ─── LLM Vision OCR ──────────────────────────────────────
	def _llm_ocr_image(image_bytes: bytes, media_type: str, groq, vision_model: str) -> str:
	b64 = base64.standard_b64encode(image_bytes).decode("utf-8")

	response = groq.chat.completions.create(
	model=vision_model,
	messages=[
	{
	"role": "user",
	"content": [
	{
	"type": "image_url",
	"image_url": {
	"url": f"data:{media_type};base64,{b64}"
	},
	},
	{
	"type": "text",
	"text": (
	"Extract ALL text from this image exactly as written. "
	"Support both Arabic (right-to-left) and English text. "
	"Preserve paragraphs and line breaks. "
	"Return ONLY the extracted text, nothing else."
	),
	},
	],
	}
	],
	temperature=0,
	)

	return response.choices[0].message.content.strip()