import base64
import logging
import os
import re
import shutil
import tempfile
from io import BytesIO

import fitz  # PyMuPDF
import gradio as gr
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Pt, RGBColor
from glmocr import GlmOcr
from PIL import Image, ImageOps

_log = logging.getLogger(__name__)

# Placeholder returned when the OCR service yields no Markdown at all.
_NO_TEXT_PLACEHOLDER = "*(Không nhận diện được)*"

# Value of the PDF-mode selector that requests one OCR call for all pages.
_MULTI_PAGE_MODE = "🚀 Multi-page (1 lần — nhanh hơn)"

# File extensions (lower-case, without the dot) that are OCR'd as single images.
_IMAGE_EXTENSIONS = frozenset(
    {"png", "jpg", "jpeg", "webp", "bmp", "tiff", "tif"}
)

# "3-7" style page range (1-based, inclusive), whitespace around "-" allowed.
_PAGE_RANGE_RE = re.compile(r"(\d+)\s*-\s*(\d+)")

# Markdown ATX heading (levels 1-3) occupying the first line of a block.
_HEADING_RE = re.compile(r"(#{1,3})[ \t]+([^\n]*)")

# Markdown table separator row such as "|---|:--:|".
_TABLE_SEPARATOR_RE = re.compile(r"^\|[-| :]+\|$")


# ---------------------------------------------------------------------------
# GLM-OCR client — uses the Zhipu MaaS API (no GPU required)
# The API key is read from the ZHIPU_API_KEY environment variable
# ---------------------------------------------------------------------------
def get_ocr_client():
    """Build a ``GlmOcr`` client configured for the MaaS pipeline."""
    return GlmOcr(
        config_override={
            "pipeline": {
                "maas": {
                    "enabled": True,
                    "api_key": os.environ.get("ZHIPU_API_KEY", ""),
                }
            }
        }
    )


def _run_ocr(payload, error_prefix: str) -> str:
    """Send *payload* to the OCR client and return Markdown or an error text.

    Failures are reported as text (``"<error_prefix>: <exception>"``) instead
    of being raised, so one bad page does not abort the whole batch.
    """
    try:
        with get_ocr_client() as parser:
            result = parser.parse(payload)
        return result.markdown or _NO_TEXT_PLACEHOLDER
    except Exception as e:
        return f"{error_prefix}: {e}"


def ocr_image(pil_img: Image.Image) -> str:
    """OCR a single PIL image and return the recognised Markdown."""
    # Apply the EXIF orientation before any conversion so the orientation tag
    # is read from the untouched image.
    pil_img = ImageOps.exif_transpose(pil_img)
    if pil_img.mode != "RGB":
        pil_img = pil_img.convert("RGB")

    # Encode as a base64 JPEG data URI.
    buf = BytesIO()
    pil_img.save(buf, format="JPEG", quality=90)
    b64 = base64.standard_b64encode(buf.getvalue()).decode()
    return _run_ocr(f"data:image/jpeg;base64,{b64}", "Lỗi OCR")


def ocr_pdf_pages(image_paths: list) -> str:
    """OCR several page images in one request (the list is one document)."""
    data_uris = []
    for path in image_paths:
        with open(path, "rb") as f:
            b64 = base64.standard_b64encode(f.read()).decode()
        data_uris.append(f"data:image/png;base64,{b64}")
    return _run_ocr(data_uris, "Lỗi OCR multi-page")


# ---------------------------------------------------------------------------
# PDF → temporary PNG images
# ---------------------------------------------------------------------------
def _parse_page_range(page_range: str, total: int) -> list:
    """Translate a page spec into 0-based page indices within ``[0, total)``.

    The spec is ``"all"`` or a comma-separated list of 1-based page numbers
    and inclusive ranges (``"1-3, 5"``). Parts that are neither a number nor a
    well-formed range are ignored; out-of-bounds pages are dropped.
    """
    spec = (page_range or "").strip().lower()
    if spec == "all":
        return list(range(total))

    indices = []
    for part in spec.split(","):
        part = part.strip()
        match = _PAGE_RANGE_RE.fullmatch(part)
        if match:
            first, last = int(match.group(1)), int(match.group(2))
            indices.extend(range(first - 1, min(last, total)))
        elif part.isdecimal():
            indices.append(int(part) - 1)
    return [i for i in indices if 0 <= i < total]


def pdf_to_image_paths(pdf_path: str, page_range: str, dpi: int = 200):
    """Render the selected PDF pages to PNG files in a fresh temp directory.

    Args:
        pdf_path: Path of the PDF to open.
        page_range: ``"all"`` or a spec such as ``"1-3, 5"`` (1-based).
        dpi: Rendering resolution; the zoom factor is ``dpi / 72``.

    Returns:
        ``(paths, nums)`` — the PNG paths and the matching 1-based page
        numbers. Both are empty (and no directory is created) when the spec
        selects no existing page. All PNGs share one directory, which the
        caller is responsible for deleting.

    Raises:
        Whatever PyMuPDF raises when the file cannot be opened or rendered.
    """
    doc = fitz.open(pdf_path)
    try:
        indices = _parse_page_range(page_range, len(doc))
        if not indices:
            return [], []

        tmp_dir = tempfile.mkdtemp(prefix="pdf_ocr_")
        try:
            zoom = dpi / 72
            mat = fitz.Matrix(zoom, zoom)
            paths, nums = [], []
            for i in indices:
                img_path = os.path.join(tmp_dir, f"page_{i + 1:04d}.png")
                pixmap = doc.load_page(i).get_pixmap(matrix=mat, alpha=False)
                pixmap.save(img_path)
                paths.append(img_path)
                nums.append(i + 1)
        except Exception:
            # Do not leave a half-filled directory behind on failure.
            shutil.rmtree(tmp_dir, ignore_errors=True)
            raise
        return paths, nums
    finally:
        doc.close()


# ---------------------------------------------------------------------------
# Export .docx
# ---------------------------------------------------------------------------
def _split_table_row(row: str) -> list:
    """Split one Markdown table row into stripped cell texts.

    Only the outer pipes are removed, so empty cells are kept and the
    remaining cells stay in their columns.
    """
    row = row.strip()
    if row.startswith("|"):
        row = row[1:]
    if row.endswith("|"):
        row = row[:-1]
    return [cell.strip() for cell in row.split("|")]


def _add_markdown_block(doc, block: str) -> None:
    """Append one blank-line-delimited Markdown block to *doc*.

    Handles leading heading lines, ``$$…$$`` formulas, pipe tables and plain
    paragraphs.
    """
    # A heading may be followed by more text after a single newline; emit the
    # heading and keep processing the remainder instead of discarding it.
    while block:
        heading = _HEADING_RE.match(block)
        if not heading:
            break
        doc.add_heading(heading.group(2).strip(), level=len(heading.group(1)) + 1)
        block = block[heading.end():].strip()
    if not block:
        return

    if block.startswith("$$") and block.endswith("$$"):
        # Formulas are kept as raw source text in a centred monospace run.
        p = doc.add_paragraph(block)
        p.runs[0].font.name = "Courier New"
        p.runs[0].font.size = Pt(10)
        p.alignment = WD_ALIGN_PARAGRAPH.CENTER
        return

    if block.startswith("|"):
        rows = [
            _split_table_row(line)
            for line in block.split("\n")
            if line.strip() and not _TABLE_SEPARATOR_RE.match(line.strip())
        ]
        if rows:
            # Size the table for the widest row so no cell is dropped.
            n_cols = max(len(cells) for cells in rows)
            tbl = doc.add_table(rows=len(rows), cols=n_cols)
            tbl.style = "Table Grid"
            for ri, cells in enumerate(rows):
                for ci, cell_text in enumerate(cells):
                    tbl.cell(ri, ci).text = cell_text
        return

    p = doc.add_paragraph(block)
    p.paragraph_format.space_after = Pt(6)


def export_docx(pages: list) -> str:
    """Write the OCR results to a temporary .docx file and return its path.

    Args:
        pages: Dicts with a ``"label"`` (section heading) and a ``"text"``
            (Markdown body) entry.

    Returns:
        Path of the saved document. The file is not deleted automatically.
    """
    doc = Document()
    title = doc.add_heading("Kết quả OCR", level=0)
    title.alignment = WD_ALIGN_PARAGRAPH.CENTER

    for item in pages:
        section = doc.add_heading(item["label"], level=1)
        # An empty label produces a heading without runs; nothing to colour.
        for run in section.runs:
            run.font.color.rgb = RGBColor(0x1A, 0x56, 0xDB)
        for block in re.split(r"\n{2,}", item["text"]):
            _add_markdown_block(doc, block.strip())
        doc.add_paragraph()

    # Reserve the name, then close the handle before saving so the document
    # is not written to a file that is still open.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".docx") as tmp:
        docx_path = tmp.name
    doc.save(docx_path)
    return docx_path


# ---------------------------------------------------------------------------
# Main pipeline
# ---------------------------------------------------------------------------
def _ocr_pdf_file(path: str, name: str, page_range: str, pdf_mode: str, progress) -> list:
    """OCR one PDF and return its result entries (``label``/``text`` dicts)."""
    doc_label = f"📄 {name}"
    try:
        img_paths, pg_nums = pdf_to_image_paths(path, page_range)
    except Exception as e:
        return [{"label": doc_label, "text": f"Lỗi đọc PDF: {e}"}]

    if not img_paths:
        # Nothing to OCR: report it rather than calling the API with no input.
        return [{
            "label": doc_label,
            "text": "⚠️ Không có trang hợp lệ trong khoảng trang đã chọn.",
        }]

    try:
        if pdf_mode == _MULTI_PAGE_MODE:
            return [{"label": doc_label, "text": ocr_pdf_pages(img_paths)}]

        entries = []
        for pg_num, img_path in progress.tqdm(list(zip(pg_nums, img_paths)), desc=name):
            try:
                with Image.open(img_path) as img:
                    text = ocr_image(img)
            except Exception as e:
                text = f"Lỗi trang {pg_num}: {e}"
            entries.append({"label": f"📄 {name} — Trang {pg_num}", "text": text})
        return entries
    finally:
        # All page images live in one temp directory; remove it once done.
        shutil.rmtree(os.path.dirname(img_paths[0]), ignore_errors=True)


def _ocr_image_file(path: str, name: str) -> dict:
    """OCR one image file and return its result entry."""
    try:
        with Image.open(path) as img:
            text = ocr_image(img)
    except Exception as e:
        text = f"Lỗi: {e}"
    return {"label": f"🖼️ {name}", "text": text}


def process(files, page_range, pdf_mode, progress=gr.Progress(track_tqdm=True)):
    """Run OCR over the uploaded files and build the Markdown/Word outputs.

    Args:
        files: Uploaded files; each item is a path string or an object whose
            ``name`` attribute is the path.
        page_range: Page spec applied to every PDF (``"all"`` or ``"1-3, 5"``).
        pdf_mode: Selected PDF mode; the multi-page value sends all pages of
            a PDF in one request, anything else processes page by page.
        progress: Gradio progress tracker used for per-page progress.

    Returns:
        ``(markdown, docx_path, md_path)``. On a configuration or input
        problem the first item is a warning message and the paths are
        ``None``. ``docx_path`` is also ``None`` when the Word export fails.
    """
    api_key = os.environ.get("ZHIPU_API_KEY", "")
    if not api_key:
        return "⚠️ Chưa cấu hình ZHIPU_API_KEY trong Secrets!", None, None
    if not files:
        return "⚠️ Vui lòng tải lên ít nhất một file.", None, None

    entries = []
    for file in files:
        path = getattr(file, "name", file)
        name = os.path.basename(path)
        ext = os.path.splitext(name)[1].lower().lstrip(".")
        if ext == "pdf":
            entries.extend(_ocr_pdf_file(path, name, page_range, pdf_mode, progress))
        elif ext in _IMAGE_EXTENSIONS:
            entries.append(_ocr_image_file(path, name))
        else:
            entries.append({"label": name, "text": "⚠️ Định dạng không hỗ trợ."})

    # Every entry (including errors) appears in both the Markdown and the Word
    # output so the two stay consistent.
    combined = "\n\n---\n\n".join(
        f"## {entry['label']}\n\n{entry['text']}" for entry in entries
    )

    docx_path = None
    try:
        docx_path = export_docx(entries)
    except Exception:
        # The Word export is best-effort: keep the Markdown result, but leave a
        # trace of why the .docx is missing.
        _log.exception("DOCX export failed")

    with tempfile.NamedTemporaryFile(
        delete=False, suffix=".md", mode="w", encoding="utf-8"
    ) as md_tmp:
        md_tmp.write(combined)
    return combined, docx_path, md_tmp.name
--------------------------------------------------------------------------- # Gradio UI # --------------------------------------------------------------------------- CSS = """ body { background: #0f1117 !important; } .gradio-container { background: #0f1117 !important; font-family: 'Inter', sans-serif !important; } .header { text-align:center; padding:2rem 1rem 1rem; } .header h1 { font-size:2rem; font-weight:700; background:linear-gradient(90deg,#4f8ef7,#7c3aed); -webkit-background-clip:text; -webkit-text-fill-color:transparent; margin:0 0 .3rem; } .header p { color:#64748b; font-size:.9rem; margin:0; } .badge { display:inline-block; background:#1a1d27; border:1px solid #2d3148; border-radius:20px; padding:.2rem .7rem; font-size:.75rem; color:#4f8ef7; margin:.2rem; } """ with gr.Blocks(title="OCR Pro", css=CSS) as demo: gr.HTML("""
Nhận diện văn bản & công thức Toán từ PDF / ảnh — xuất Markdown & Word