Spaces:

mlbench123
/

ocr-extraction

Sleeping

App Files Files Community

ocr-extraction / app.py

mlbench123

Update app.py

e0549ff verified about 1 month ago

Raw

History Blame Contribute Delete

22.5 kB

	"""
	app.py – OCR Route Data Extraction | Hugging Face Space
	============================================================

	Pipeline
	--------
	Stage 1 PaddleOCR
	Deep-learning OCR — far more accurate than EasyOCR for photos.
	Returns word-level bounding boxes + text.
	Words are clustered into horizontal row-bands and sorted L→R
	to produce one clean text line per table row.

	Stage 2 Qwen/Qwen2.5-72B-Instruct (HF serverless Inference API, GPU)
	Receives row-organised text + strict JSON schema prompt.
	Returns ONE complete structured JSON array in a single call.
	The model corrects OCR typos, understands table context, and
	classifies each instruction into a navigation constraint.

	No hand-written column parsers. No regex tables.
	The LLM understands meaning; PaddleOCR provides accurate characters.
	"""

	from __future__ import annotations
	import datetime, json, logging, os, re, time
	from statistics import median

	import cv2
	import gradio as gr
	import numpy as np
	from PIL import Image
	from huggingface_hub import InferenceClient

	# Pipe-separated log format. The separators are plain "|" characters: a
	# backslash in front of them is an invalid string escape and would also be
	# printed verbatim in every log line.
	logging.basicConfig(
	    level=logging.INFO,
	    format="%(asctime)s | %(levelname)s | %(message)s",
	)
	log = logging.getLogger(__name__)

	# Model identifier passed to the chat-completion call in call_llm().
	LLM_MODEL = "Qwen/Qwen2.5-72B-Instruct"


	# ──────────────────────────────────────────────────────────
	# STAGE 1a – IMAGE PREPROCESSING
	# ──────────────────────────────────────────────────────────

	def preprocess(pil_img: Image.Image) -> np.ndarray:
	    """
	    Prepare an uploaded image for OCR.

	    Steps, in order: PIL → BGR array, upscale so the long side is at least
	    2800 px, grayscale, unsharp mask, non-local-means denoise, adaptive
	    threshold, and finally a small-angle deskew.

	    Args:
	        pil_img: Source image; it is converted to RGB before processing.

	    Returns:
	        The thresholded single-channel image, rotated when a skew between
	        0.3° and 5° was detected.
	    """
	    min_deskew_deg = 0.3   # below this the rotation is not worth the resampling
	    max_deskew_deg = 5.0   # larger estimates are treated as unreliable and ignored

	    img = cv2.cvtColor(np.array(pil_img.convert("RGB")), cv2.COLOR_RGB2BGR)
	    h, w = img.shape[:2]

	    # Upscale to at least 2800px long side
	    if max(h, w) < 2800:
	        s = 2800 / max(h, w)
	        img = cv2.resize(img, None, fx=s, fy=s, interpolation=cv2.INTER_CUBIC)

	    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

	    # Unsharp mask – recovers thin font strokes
	    blur = cv2.GaussianBlur(gray, (0, 0), 3)
	    gray = cv2.addWeighted(gray, 1.5, blur, -0.5, 0)

	    # Denoise
	    gray = cv2.fastNlMeansDenoising(gray, h=10)

	    # Adaptive threshold – handles uneven lighting from phone camera
	    thresh = cv2.adaptiveThreshold(
	        gray, 255,
	        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
	        cv2.THRESH_BINARY,
	        blockSize=21, C=8,
	    )

	    # Deskew up to ±5°. Dark pixels are used as the text mask; np.where
	    # yields them as (row, col) pairs.
	    coords = np.column_stack(np.where(thresh < 128))
	    if len(coords) > 100:
	        angle = cv2.minAreaRect(coords)[-1]
	        # The angle range of minAreaRect differs between OpenCV releases
	        # ([-90, 0) vs (0, 90]). Folding it into [-45, 45) gives the same
	        # small-skew estimate under either convention, so an upright page
	        # reported as 90° is no longer rotated by a quarter turn.
	        skew = (angle + 45.0) % 90.0 - 45.0
	        # The points are (row, col), i.e. transposed with respect to image
	        # (x, y), which mirrors the measured angle; the correcting rotation
	        # is therefore the negated estimate.
	        # NOTE(review): confirm the direction on a visibly skewed sample.
	        rotation = -skew
	        if min_deskew_deg < abs(rotation) <= max_deskew_deg:
	            h2, w2 = thresh.shape
	            M = cv2.getRotationMatrix2D((w2 // 2, h2 // 2), rotation, 1.0)
	            thresh = cv2.warpAffine(
	                thresh, M, (w2, h2),
	                flags=cv2.INTER_CUBIC,
	                borderMode=cv2.BORDER_REPLICATE,
	            )
	    return thresh


	# ──────────────────────────────────────────────────────────
	# STAGE 1b – PaddleOCR (lazy-loaded once)
	# ──────────────────────────────────────────────────────────

	_ocr_engine = None
	_ocr_name = "none"


	def get_ocr():
	    """
	    Return the cached OCR engine and its name, creating it on first use.

	    PaddleOCR is tried first. If importing or constructing it raises, the
	    failure is logged and EasyOCR (``gpu=False``) is tried instead. The
	    engine that loads is stored in the module globals and reused by later
	    calls.

	    Returns:
	        ``(engine, name)`` where ``name`` is ``"PaddleOCR"`` or ``"EasyOCR"``.

	    Raises:
	        RuntimeError: Neither engine could be loaded. The EasyOCR failure is
	            chained as the cause and the PaddleOCR failure is included in
	            the message.
	    """
	    global _ocr_engine, _ocr_name
	    if _ocr_engine is not None:
	        return _ocr_engine, _ocr_name

	    # ── Try PaddleOCR first (best accuracy for document photos) ──
	    try:
	        from paddleocr import PaddleOCR
	        log.info("Loading PaddleOCR...")
	        _ocr_engine = PaddleOCR(
	            use_textline_orientation=True,
	            lang="en",
	            text_det_thresh=0.3,
	            text_det_box_thresh=0.5,
	        )
	        _ocr_name = "PaddleOCR"
	        log.info("PaddleOCR ready.")
	        return _ocr_engine, _ocr_name
	    except Exception as e:
	        # Keep a reference: the name bound by `except ... as` is cleared
	        # when the handler exits, and the final error should mention it.
	        paddle_error = e
	        log.warning("PaddleOCR unavailable (%s) — falling back to EasyOCR", e)

	    # ── Fallback: EasyOCR ────────────────────────────────────────
	    try:
	        import easyocr
	        log.info("Loading EasyOCR...")
	        _ocr_engine = easyocr.Reader(["en"], gpu=False, verbose=False)
	        _ocr_name = "EasyOCR"
	        log.info("EasyOCR ready.")
	        return _ocr_engine, _ocr_name
	    except Exception as e:
	        raise RuntimeError(
	            f"No OCR engine available: {e} (PaddleOCR: {paddle_error})"
	        ) from e


	def _run_paddle(engine, img: np.ndarray) -> list[tuple]:
	    """
	    Run ``engine.predict`` and normalise the output to (bbox, text, conf) tuples.

	    Two result layouts are understood:

	    * key-addressable results exposing ``"dt_polys"`` or ``"dt_boxes"``
	      together with ``"rec_texts"`` and ``"rec_scores"``;
	    * iterable results made of ``[bbox, (text, conf)]`` pairs.

	    Blank texts are dropped. Key-addressable results keep confidences of at
	    least 0.3; pair-style results keep confidences strictly above 0.3.

	    Args:
	        engine: Object providing ``predict(img)``.
	        img: Image handed to ``predict`` unchanged.

	    Returns:
	        List of ``(bbox, text, conf)`` tuples.

	    Raises:
	        RuntimeError: ``engine.predict`` raised; the original error is chained.
	    """
	    try:
	        raw = list(engine.predict(img))  # materialise generator
	    except Exception as e:
	        raise RuntimeError(f"PaddleOCR.predict failed: {e}") from e

	    detections = []
	    for res in raw:
	        # ── Try key access ───────────────────────────────────────
	        # The triple is built in a single expression so that a result which
	        # has the box key but lacks texts/scores cannot leave a half-filled
	        # state behind (which would make the zip() below fail).
	        parsed = None
	        for box_key in ("dt_polys", "dt_boxes"):
	            try:
	                parsed = (
	                    list(res[box_key]),
	                    list(res["rec_texts"]),
	                    list(res["rec_scores"]),
	                )
	                break
	            except (KeyError, TypeError, IndexError):
	                continue

	        if parsed is None:
	            # ── Fallback: [[bbox, (text, conf)], ...] ────────────
	            try:
	                for item in res:
	                    if not (isinstance(item, (list, tuple)) and len(item) == 2):
	                        continue
	                    bbox_raw, text_conf = item
	                    if not (isinstance(text_conf, (list, tuple)) and len(text_conf) == 2):
	                        continue
	                    text, conf = text_conf
	                    text = str(text).strip()
	                    if text and float(conf) > 0.3:
	                        bbox = bbox_raw.tolist() if hasattr(bbox_raw, "tolist") else list(bbox_raw)
	                        detections.append((bbox, text, float(conf)))
	            except Exception as e:
	                log.warning("Skipping unparseable OCR result: %s", e)
	            continue

	        # ── Parse boxes/texts/scores ─────────────────────────────
	        for bbox_raw, text, conf in zip(*parsed):
	            text = str(text).strip()
	            if not text or float(conf) < 0.3:
	                continue
	            # Convert numpy array → plain list of [x,y] points
	            if hasattr(bbox_raw, "tolist"):
	                bbox = bbox_raw.tolist()
	            elif isinstance(bbox_raw, (list, tuple)):
	                bbox = [[float(c) for c in pt] for pt in bbox_raw]
	            else:
	                continue
	            detections.append((bbox, text, float(conf)))

	    log.info("PaddleOCR returned %d detections", len(detections))
	    return detections


	def _run_easyocr(engine, img: np.ndarray) -> list[tuple]:
	    """Call ``engine.readtext`` and keep non-blank hits scoring above 0.3.

	    Each kept hit is returned as ``(bbox, text, float(conf))``; the text is
	    passed through as received (not stripped).
	    """
	    kept = []
	    for box, label, score in engine.readtext(img, detail=1, paragraph=False):
	        if not label.strip():
	            continue
	        if float(score) > 0.3:
	            kept.append((box, label, float(score)))
	    return kept


	def run_ocr(img: np.ndarray) -> tuple[list[tuple], str]:
	    """Run the cached OCR engine on *img*.

	    Returns:
	        ``(detections, engine_name)``; the detections come from the
	        PaddleOCR runner when that engine is active, otherwise from the
	        EasyOCR runner.
	    """
	    engine, name = get_ocr()
	    runner = _run_paddle if name == "PaddleOCR" else _run_easyocr
	    dets = runner(engine, img)
	    log.info("[%s] %d detections", name, len(dets))
	    return dets, name


	# ──────────────────────────────────────────────────────────
	# STAGE 1c – ROW BAND CLUSTERING
	# ──────────────────────────────────────────────────────────

	def _cx(bbox) -> float:
	    """Return the mean of the first coordinate over the points of *bbox*."""
	    if hasattr(bbox, "tolist"):
	        bbox = bbox.tolist()  # array-like → plain nested lists
	    return sum(float(point[0]) for point in bbox) / len(bbox)

	def _cy(bbox) -> float:
	    """Return the mean of the second coordinate over the points of *bbox*."""
	    if hasattr(bbox, "tolist"):
	        bbox = bbox.tolist()  # array-like → plain nested lists
	    return sum(float(point[1]) for point in bbox) / len(bbox)


	# Matches a line made up solely of column-header words, e.g. "Miles" or
	# "Miles Route To Distance Time". The alternation uses plain "|" and the
	# surrounding whitespace is optional, because the pattern is searched
	# against already-stripped row text.
	_HEADER_PAT = re.compile(
	    r"^\s*(?:(?:miles|route|distance|time|to)\b\s*)+$", re.I
	)


	def detections_to_rows(detections: list[tuple]) -> list[str]:
	    """
	    Group OCR detections into horizontal bands and emit one line per band.

	    Detections are ordered by vertical centre; a new band starts whenever
	    the vertical distance to the previous detection exceeds a threshold
	    derived from the median gap (never below 10). Tokens inside a band are
	    joined left → right with single spaces, and bands whose text matches
	    the header pattern are skipped.

	    Example output line:
	    "9.90 SL8EFR n Merge onto SL8SFR ne [BW 8SFR] 43.00 01:00"
	    """
	    if not detections:
	        return []

	    # (y-centre, x-centre, text) triples, ordered top → bottom
	    centred = [(_cy(box), _cx(box), text) for box, text, _ in detections]
	    centred.sort(key=lambda entry: entry[0])

	    # Median vertical gap between neighbours → row-separation threshold
	    gaps = [below[0] - above[0] for above, below in zip(centred, centred[1:])]
	    line_h = max(1.0, median(gaps) if gaps else 20)
	    row_thr = max(line_h * 0.65, 10)

	    bands: list[list] = [[centred[0]]]
	    for entry in centred[1:]:
	        previous_y = bands[-1][-1][0]
	        if entry[0] - previous_y > row_thr:
	            bands.append([entry])
	        else:
	            bands[-1].append(entry)

	    rows = []
	    for band in bands:
	        left_to_right = sorted(band, key=lambda entry: entry[1])
	        line = " ".join(entry[2] for entry in left_to_right)
	        # Keep everything except header bands
	        if not _HEADER_PAT.search(line.strip()):
	            rows.append(line)

	    log.info("Row clustering produced %d rows", len(rows))
	    return rows


	# ──────────────────────────────────────────────────────────
	# STAGE 2 – LLM (Qwen2.5-72B via HF Inference API, GPU)
	# ──────────────────────────────────────────────────────────

	# System prompt for the extraction model. Alternatives and column separators
	# are written with a plain "|": a backslash before the pipe is an invalid
	# string escape and would be sent to the model verbatim.
	SYSTEM_PROMPT = """You are a strict route data extraction engine for permit documents.

	INPUT: Raw OCR text from a route table image. Each numbered line is one table row.
	The columns are: Segment Miles | Road/Route | Navigation Instruction | Cumulative Miles | Time

	YOUR TASK: Parse every row and return ONE valid JSON array. Nothing else.

	━━━ OUTPUT FORMAT ━━━
	Start with [ and end with ].
	No explanation, no markdown, no code fences.

	Each element in the array:
	{
	"step": <integer, 1-based sequential>,
	"segment_miles": <float, distance for this segment>,
	"road": <string, road/highway identifier e.g. "SL8EFR n", "IH45 n">,
	"instruction": <string, clean navigation text>,
	"cumulative_miles": <float, total distance from start>,
	"time": <string, "HH:MM" format>,
	"constraints": [
	{
	"type": <"mandatory_action" | "restriction" | "conditional_rule">,
	"action": <string>,
	"location": <string, road or junction name>,
	"priority": <"hard" | "soft">,
	"condition": <string or null>
	}
	]
	}

	━━━ FIELD RULES ━━━
	segment_miles : small decimal at START of line (e.g. 9.90, 0.20, 214.10)
	road : highway code after segment miles (e.g. "SL8EFR n", "IH45 n", "US287 Ramp nw")
	instruction : the navigation sentence in the middle (longest text part)
	cumulative_miles: the larger decimal near the END of the line (running total, always > 40)
	time : HH:MM near the end. ONLY accept values where hours 0-23, minutes 0-59.
	If a value like "43.00" appears, it is cumulative_miles not time.
	Fix separators: "01.12" or "01*12" → "01:12"
	If cumulative > 1000, it has a misplaced decimal: 38290 → 382.90

	━━━ OCR CORRECTION ━━━
	Fix these common errors in the instruction and road fields:
	onlo/Onlo → onto/Onto Tum/Tumn → Turn
	lelt/lcli/Ielt → left nighl/righl/rght → right
	loward/l0ward → toward conneclor/conecor → connector
	Straighi/Straighl → Straight Continuo/Conlinue → Continue
	SH1OT/SHTOT/SHTOI → SH101 IH4S → IH45
	IHZO/IH2O → IH20 IHAO/IH4O → IH40
	UST83/UST8J → US183 USZ87 → US287
	SLB → SL8 IH3S/IH3SE → IH35

	━━━ CONSTRAINT RULES ━━━
	Extract exactly 1 constraint per step. Empty array [] only if truly no action.
	mandatory_action → merge, turn_left, turn_right, take_exit, take_ramp,
	take_connector, continue_straight → priority: "hard"
	conditional_rule → keep_left, keep_right → priority: "soft"
	restriction → no_turn, prohibited_action → priority: "hard"

	━━━ EXAMPLES ━━━
	Row: "9.90 SL8EFR n Merge onto SL8SFR ne [BW 8SFR] [WEST SAM HOUSTON PARKWAY] 43.00 01:00"
	→ {"step":1,"segment_miles":9.9,"road":"SL8EFR n","instruction":"Merge onto SL8SFR ne [BW 8SFR] [WEST SAM HOUSTON PARKWAY]","cumulative_miles":43.0,"time":"01:00","constraints":[{"type":"mandatory_action","action":"merge","location":"SL8SFR ne / BW 8SFR","priority":"hard","condition":null}]}

	Row: "0.20 IH45 e Keep left toward IH45 North/Dallas 51.30 01:13"
	→ {"step":5,"segment_miles":0.2,"road":"IH45 e","instruction":"Keep left toward IH45 North/Dallas","cumulative_miles":51.3,"time":"01:13","constraints":[{"type":"conditional_rule","action":"keep_left","location":"IH45 North / Dallas","priority":"soft","condition":"heading toward IH45 North/Dallas"}]}"""


	def call_llm(row_lines: list[str]) -> str:
	    """
	    Send the numbered OCR rows to ``LLM_MODEL`` and return its reply text.

	    The rows are numbered from 1 and sent as the user message, with
	    ``SYSTEM_PROMPT`` as the system message. The access token is read from
	    the ``HF_TOKEN`` environment variable; when it is unset or empty the
	    client is created without a token.

	    Args:
	        row_lines: One string per clustered table row.

	    Returns:
	        The stripped content of the first choice, or ``""`` when the reply
	        carries no content.

	    Errors raised by the client call are not caught here.
	    """
	    table_text = "\n".join(
	        f"{number}. {line}" for number, line in enumerate(row_lines, start=1)
	    )

	    token = os.environ.get("HF_TOKEN", "")
	    client = InferenceClient(token=token or None)

	    log.info("Calling %s with %d rows ...", LLM_MODEL, len(row_lines))
	    t0 = time.perf_counter()

	    response = client.chat_completion(
	        model=LLM_MODEL,
	        messages=[
	            {"role": "system", "content": SYSTEM_PROMPT},
	            {"role": "user", "content":
	                f"OCR rows from route document:\n{table_text}\n\nReturn the complete JSON array:"},
	        ],
	        max_tokens=8000,
	        temperature=0.01,
	    )

	    # The content of a reply can be None; treat that as an empty reply so
	    # the caller gets a clean "no JSON array" error instead of an
	    # AttributeError from .strip().
	    content = response.choices[0].message.content
	    raw = (content or "").strip()
	    log.info("LLM call finished in %.1fs", time.perf_counter() - t0)
	    return raw


	def parse_llm_json(raw: str) -> list[dict]:
	    """
	    Extract the JSON array from an LLM reply.

	    Markdown code fences are removed, then the value starting at the first
	    ``[`` is decoded with a real JSON decoder. Brackets that appear inside
	    string values (for example ``"[BW 8SFR"``) therefore cannot confuse the
	    extraction, and any text after the array is ignored.

	    If the array cannot be decoded as a whole (typically because the reply
	    was cut off), the complete elements at its start are salvaged and
	    returned.

	    Args:
	        raw: Reply text from the model.

	    Returns:
	        The decoded array.

	    Raises:
	        ValueError: The reply contains no ``[``, or not even one element
	            could be decoded (raised as ``json.JSONDecodeError``, a
	            ``ValueError`` subclass).
	    """
	    text = re.sub(r"```(?:json)?", "", raw, flags=re.I).strip()
	    start = text.find("[")
	    if start == -1:
	        raise ValueError("LLM response contains no JSON array")

	    decoder = json.JSONDecoder()
	    try:
	        value, _ = decoder.raw_decode(text, start)
	        return value
	    except json.JSONDecodeError as err:
	        failure = err  # re-raised below if nothing can be salvaged

	    # Salvage: decode elements one at a time until the first broken one.
	    items = []
	    pos = start + 1
	    while True:
	        while pos < len(text) and text[pos] in " \t\r\n,":
	            pos += 1
	        try:
	            item, pos = decoder.raw_decode(text, pos)
	        except json.JSONDecodeError:
	            break
	        items.append(item)

	    if not items:
	        raise failure
	    return items


	# ──────────────────────────────────────────────────────────
	# POST-PROCESSING – normalise types, fix edge cases
	# ──────────────────────────────────────────────────────────

	_TIME_RE = re.compile(r"\b(\d{1,2})[.:;*,](\d{2})\b")


	def _fix_time(v: str) -> str:
	    """Return the first plausible clock time found in *v* as "HH:MM".

	    A candidate is one or two digits, one separator out of ``. : ; * ,``
	    and two digits. Candidates outside 0-23 hours / 0-59 minutes are
	    skipped; when none qualifies the result is "00:00".
	    """
	    text = str(v)
	    plausible = (
	        (int(hit.group(1)), int(hit.group(2)))
	        for hit in _TIME_RE.finditer(text)
	    )
	    for h, mn in plausible:
	        if h in range(24) and mn in range(60):
	            return f"{h:02d}:{mn:02d}"
	    return "00:00"


	def _fix_miles(v) -> float:
	    """Coerce *v* to a mileage rounded to two decimals.

	    Commas are read as decimal points. Values above 1000 are divided by
	    100. Anything that cannot be converted yields 0.0.
	    """
	    try:
	        miles = float(str(v).replace(",", "."))
	        if miles > 1000:
	            miles = miles / 100
	        return round(miles, 2)
	    except (ValueError, TypeError):
	        return 0.0


	_VALID_TYPES = {"mandatory_action", "restriction", "conditional_rule"}
	_VALID_PRIO = {"hard", "soft"}


	def clean_steps(steps: list[dict]) -> list[dict]:
	    """
	    Normalise the step objects decoded from the LLM reply.

	    For every step: renumber ``step`` sequentially from 1, coerce both
	    mileage fields and the time field, fill in ``road`` ("UNKNOWN") and
	    ``instruction`` ("") when missing or null, and sanitise ``constraints``.
	    Each constraint gets a valid ``type`` (default "mandatory_action") and
	    ``priority`` (default "hard") plus ``action``, ``location`` and
	    ``condition`` keys.

	    Entries that are not objects are skipped, a ``constraints`` value that
	    is not a list is treated as empty, and the input objects are copied
	    rather than modified.

	    Args:
	        steps: Decoded array elements.

	    Returns:
	        A new list of cleaned step dicts.
	    """
	    out = []
	    for raw_step in steps:
	        if not isinstance(raw_step, dict):
	            continue  # stray string/number/null in the array
	        s = dict(raw_step)
	        s["step"] = len(out) + 1
	        s["segment_miles"] = _fix_miles(s.get("segment_miles", 0))
	        s["cumulative_miles"] = _fix_miles(s.get("cumulative_miles", 0))
	        s["time"] = _fix_time(s.get("time", ""))
	        if s.get("road") is None:
	            s["road"] = "UNKNOWN"
	        if s.get("instruction") is None:
	            s["instruction"] = ""

	        raw_constraints = s.get("constraints")
	        if not isinstance(raw_constraints, list):
	            raw_constraints = []  # missing, null, or a bare object/string

	        clean_c = []
	        for raw_c in raw_constraints:
	            if not isinstance(raw_c, dict):
	                continue
	            c = dict(raw_c)
	            ctype = c.get("type", "mandatory_action")
	            prio = c.get("priority", "hard")
	            # isinstance guards keep unhashable values (lists/objects) from
	            # raising in the set membership test.
	            if not (isinstance(ctype, str) and ctype in _VALID_TYPES):
	                ctype = "mandatory_action"
	            if not (isinstance(prio, str) and prio in _VALID_PRIO):
	                prio = "hard"
	            c["type"] = ctype
	            c["priority"] = prio
	            c.setdefault("action", "")
	            c.setdefault("location", "")
	            c.setdefault("condition", None)
	            clean_c.append(c)
	        s["constraints"] = clean_c
	        out.append(s)
	    return out


	# ──────────────────────────────────────────────────────────
	# MAIN PIPELINE
	# ──────────────────────────────────────────────────────────

	def run_pipeline(image, progress=gr.Progress(track_tqdm=True)):
	    """
	    Run the whole image → JSON pipeline for one uploaded image.

	    Stages: preprocessing, OCR, row clustering, LLM call, JSON parsing and
	    clean-up. Every stage failure is reported in the returned JSON instead
	    of being raised.

	    Args:
	        image: Uploaded image, or ``None`` when nothing was provided.
	        progress: Callable used to report progress as
	            ``progress(fraction, desc=...)``.

	    Returns:
	        ``(json_text, debug_text)``. ``json_text`` is either the result
	        document or an object with an ``"error"`` key. ``debug_text`` holds
	        the numbered OCR rows once clustering has succeeded, otherwise "".
	    """
	    if image is None:
	        return '{"error": "No image provided."}', ""

	    t0 = time.perf_counter()

	    # ── Stage 1: preprocess ──────────────────────────────
	    progress(0.05, desc="Preprocessing image...")
	    try:
	        processed = preprocess(image)
	    except Exception as e:
	        log.error("Preprocessing error: %s", e)
	        return json.dumps({"error": f"Preprocessing failed: {e}"}), ""

	    # ── Stage 1: OCR ─────────────────────────────────────
	    progress(0.15, desc="Running OCR (PaddleOCR)...")
	    try:
	        detections, ocr_name = run_ocr(processed)
	    except Exception as e:
	        return json.dumps({"error": f"OCR failed: {e}"}), ""

	    if not detections:
	        return '{"error": "OCR returned no text. Try a clearer image."}', ""

	    # ── Stage 1: row clustering ───────────────────────────
	    progress(0.35, desc="Organising rows...")
	    row_lines = detections_to_rows(detections)
	    if not row_lines:
	        return '{"error": "No table rows found after clustering."}', ""

	    debug = "\n".join(f"[row {i+1:02d}] {r}" for i, r in enumerate(row_lines))

	    # ── Stage 2: LLM ─────────────────────────────────────
	    progress(0.50, desc=f"Sending {len(row_lines)} rows to LLM...")
	    try:
	        raw_llm = call_llm(row_lines)
	    except Exception as e:
	        log.error("LLM error: %s", e)
	        return json.dumps({"error": f"LLM API failed: {e}", "ocr_rows": row_lines}), debug

	    # ── Parse + clean ─────────────────────────────────────
	    # Cleaning runs inside the same guard: a reply that is valid JSON but
	    # has the wrong shape is reported the same way as unparseable output.
	    progress(0.90, desc="Parsing JSON response...")
	    try:
	        steps = clean_steps(parse_llm_json(raw_llm))
	    except Exception as e:
	        log.error("JSON parse error: %s | raw: %.300s", e, raw_llm)
	        return json.dumps({
	            "error": f"LLM returned invalid JSON: {e}",
	            "raw_output": raw_llm[:1000],
	        }), debug

	    last_cum = max((s["cumulative_miles"] for s in steps), default=0.0)
	    last_time = next((s["time"] for s in reversed(steps) if s["time"] != "00:00"), "00:00")

	    # One timezone-aware timestamp feeds both fields, so they cannot
	    # disagree and the deprecated naive utcnow() is avoided.
	    now = datetime.datetime.now(datetime.timezone.utc)
	    result = {
	        "source": f"uploaded_{now.strftime('%H%M%S')}.png",
	        "extracted_at": now.strftime("%Y-%m-%dT%H:%M:%SZ"),
	        "ocr_engine": ocr_name,
	        "llm_model": LLM_MODEL,
	        "total_steps": len(steps),
	        "total_miles": last_cum,
	        "total_time": last_time,
	        "steps": steps,
	    }

	    log.info("Pipeline done in %.1fs — %d steps", time.perf_counter() - t0, len(steps))
	    return json.dumps(result, indent=2, ensure_ascii=False), debug


	# ──────────────────────────────────────────────────────────
	# GRADIO UI
	# ──────────────────────────────────────────────────────────

	# Gradio interface: image upload + button on the left; JSON result and the
	# OCR rows that were sent to the LLM on the right. The Markdown table uses
	# plain "|" separators — an escaped pipe is rendered as literal text and
	# stops the table from being recognised.
	with gr.Blocks(title="OCR Route Extraction") as demo:
	    demo.queue()
	    gr.Markdown(f"""
	## OCR Route Data Extraction Pipeline

	| Stage | Component | Role |
	|-------|-----------|------|
	| 1 | PaddleOCR (local) | Deep-learning OCR → word bounding boxes |
	| 2 | Row clustering | Groups words into table rows by y-position |
	| 3 | {LLM_MODEL} (HF GPU) | Row text → complete structured JSON in one call |

	Constraint types: `mandatory_action` · `restriction` · `conditional_rule`
	""")

	    with gr.Row():
	        with gr.Column(scale=1):
	            img_input = gr.Image(type="pil", label="Upload Route Document Image", height=460)
	            run_btn = gr.Button("Extract Route Data", variant="primary", size="lg")

	        with gr.Column(scale=2):
	            with gr.Tabs():
	                with gr.Tab("JSON Output"):
	                    json_out = gr.Code(language="json", label="Structured JSON", lines=34)
	                with gr.Tab("OCR Rows (sent to LLM)"):
	                    ocr_out = gr.Textbox(
	                        label="Row-organised text — exactly what the LLM receives",
	                        lines=26, max_lines=60,
	                    )

	    # run_pipeline returns (json_text, debug_text), matching the two outputs.
	    run_btn.click(
	        fn=run_pipeline,
	        inputs=[img_input],
	        outputs=[json_out, ocr_out],
	        api_name=False,
	    )

	    gr.Examples(examples=[["route_sample.png"]], inputs=[img_input],
	                label="Sample route image")

	# Script entry point: start the Gradio app only when this file is executed
	# directly, not when it is imported.
	if __name__ == "__main__":
	# queue() has already been called once while the Blocks UI was built.
	demo.queue()
	# NOTE(review): whether launch() accepts `theme` depends on the installed
	# Gradio version — confirm against the pinned release.
	demo.launch(theme=gr.themes.Soft(), share=True)