| import gradio as gr |
| import tempfile |
| import os |
| import re |
| import base64 |
| from io import BytesIO |
| from PIL import Image, ImageOps |
| import fitz |
| from docx import Document |
| from docx.shared import Pt, RGBColor |
| from docx.enum.text import WD_ALIGN_PARAGRAPH |
| from glmocr import GlmOcr |
|
|
| |
| |
| |
| |
def get_ocr_client():
    """Create a ``GlmOcr`` client whose ``pipeline.maas`` section is enabled.

    The API key is taken from the ``ZHIPU_API_KEY`` environment variable; an
    empty string is passed when the variable is not set.
    """
    maas_settings = {
        "enabled": True,
        "api_key": os.environ.get("ZHIPU_API_KEY", ""),
    }
    return GlmOcr(config_override={"pipeline": {"maas": maas_settings}})
|
|
def ocr_image(pil_img: Image.Image) -> str:
    """OCR a single PIL image and return the result as Markdown.

    The image is rotated according to its orientation tag, converted to RGB,
    encoded as a JPEG data URI and passed to the OCR client.

    Args:
        pil_img: The image to recognise.

    Returns:
        The Markdown from the OCR result, a placeholder string when that
        Markdown is empty, or a string starting with ``"Lỗi OCR:"`` when any
        exception is raised while encoding or recognising the image.
    """
    try:
        # Apply the orientation tag on the image exactly as it was opened.
        # Doing this after convert() can miss the tag, because convert()
        # returns a new image that may not carry format-specific metadata
        # (e.g. TIFF tags) from which the orientation is read.
        pil_img = ImageOps.exif_transpose(pil_img)
        if pil_img.mode != "RGB":
            # JPEG cannot store alpha/palette modes.
            pil_img = pil_img.convert("RGB")

        buf = BytesIO()
        pil_img.save(buf, format="JPEG", quality=90)
        b64 = base64.standard_b64encode(buf.getvalue()).decode()
        data_uri = f"data:image/jpeg;base64,{b64}"

        with get_ocr_client() as parser:
            result = parser.parse(data_uri)
        return result.markdown or "*(Không nhận diện được)*"
    except Exception as e:
        # Errors are reported as text so the caller can show them per page.
        return f"Lỗi OCR: {e}"
|
|
def ocr_pdf_pages(image_paths: list) -> str:
    """OCR several page images in one request (the list is one document).

    Each file is read, base64-encoded and wrapped in a data URI labelled
    ``image/png``; the whole list is then handed to a single ``parse`` call.

    Args:
        image_paths: Paths of the page images, in document order.

    Returns:
        The Markdown from the OCR result, a placeholder string when there is
        nothing to recognise (empty input or empty result), or a string
        starting with ``"Lỗi OCR multi-page:"`` when reading a file or the
        OCR call raises.
    """
    if not image_paths:
        # Nothing to send: skip the remote call entirely.
        return "*(Không nhận diện được)*"

    try:
        # Reading happens inside the try block so that an unreadable page is
        # reported the same way as an OCR failure instead of escaping.
        data_uris = []
        for path in image_paths:
            with open(path, "rb") as f:
                b64 = base64.standard_b64encode(f.read()).decode()
            data_uris.append(f"data:image/png;base64,{b64}")

        with get_ocr_client() as parser:
            result = parser.parse(data_uris)
        return result.markdown or "*(Không nhận diện được)*"
    except Exception as e:
        return f"Lỗi OCR multi-page: {e}"
|
|
| |
| |
| |
def _parse_page_range(page_range: str, total: int) -> list:
    """Turn a 1-based page spec ("all", "1-5", "1,3,5") into 0-based indices.

    Parts that are not a page number or a ``first-last`` pair of page numbers
    are skipped; indices outside ``0 <= i < total`` are dropped.
    """
    if page_range.strip().lower() == "all":
        return list(range(total))

    indices = []
    for part in page_range.split(","):
        part = part.strip()
        if "-" in part:
            first, _, last = part.partition("-")
            first, last = first.strip(), last.strip()
            # Skip malformed ranges ("3-", "a-b", "1-2-3") the same way
            # malformed single pages are skipped, instead of raising.
            if first.isdecimal() and last.isdecimal():
                indices.extend(range(int(first) - 1, min(int(last), total)))
        elif part.isdecimal():
            indices.append(int(part) - 1)
    return [i for i in indices if 0 <= i < total]


def pdf_to_image_paths(pdf_path: str, page_range: str, dpi: int = 200):
    """Render the selected pages of a PDF to PNG files in a new temp folder.

    Args:
        pdf_path: Path of the PDF to open.
        page_range: ``"all"``, or a comma-separated list of 1-based page
            numbers and ``first-last`` ranges (e.g. ``"1-5"``, ``"1,3,5"``).
        dpi: Rendering resolution; the zoom factor used is ``dpi / 72``.

    Returns:
        A ``(paths, nums)`` tuple: the PNG paths and the matching 1-based
        page numbers, in the order requested. Both are empty when the range
        selects no existing page.

    The PNG files are written to a directory created with
    ``tempfile.mkdtemp``; removing them is left to the caller.
    """
    doc = fitz.open(pdf_path)
    try:
        pages = _parse_page_range(page_range, len(doc))
        tmp_dir = tempfile.mkdtemp(prefix="pdf_ocr_")
        mat = fitz.Matrix(dpi / 72, dpi / 72)
        paths, nums = [], []
        for i in pages:
            img_path = os.path.join(tmp_dir, f"page_{i+1:04d}.png")
            doc.load_page(i).get_pixmap(matrix=mat, alpha=False).save(img_path)
            paths.append(img_path)
            nums.append(i + 1)
    finally:
        # Close the document even when rendering a page raises.
        doc.close()
    return paths, nums
|
|
| |
| |
| |
def _split_table_row(row: str) -> list:
    """Split one Markdown table row into stripped cell texts, keeping empty cells."""
    row = row.strip()
    if row.startswith("|"):
        row = row[1:]
    if row.endswith("|"):
        row = row[:-1]
    return [cell.strip() for cell in row.split("|")]


def _add_markdown_block(doc, block: str) -> None:
    """Append one blank-line-delimited Markdown block to the Word document."""
    # A heading may be followed by more lines in the same block; emit the
    # heading(s) and keep processing whatever text remains.
    while True:
        hm = re.match(r"^(#{1,3})\s+(.*)", block)
        if not hm:
            break
        # Level 1 is used for the per-page label, so "#" maps to level 2.
        doc.add_heading(hm.group(2).strip(), level=len(hm.group(1)) + 1)
        block = block[hm.end():].strip()
        if not block:
            return

    # Display math is kept as literal text in a centred monospace paragraph.
    if block.startswith("$$") and block.endswith("$$"):
        p = doc.add_paragraph(block)
        p.runs[0].font.name = "Courier New"
        p.runs[0].font.size = Pt(10)
        p.alignment = WD_ALIGN_PARAGRAPH.CENTER
        return

    if block.startswith("|"):
        # Drop separator rows such as "|---|:--:|".
        rows = [
            r for r in block.split("\n")
            if r.strip() and not re.match(r"^\|[-| :]+\|$", r.strip())
        ]
        if rows:
            n_cols = max(len(_split_table_row(rows[0])), 1)
            tbl = doc.add_table(rows=len(rows), cols=n_cols)
            tbl.style = "Table Grid"
            for ri, row in enumerate(rows):
                row_cells = tbl.rows[ri].cells
                # Cells beyond the header width are ignored.
                for ci, cell in enumerate(_split_table_row(row)[:n_cols]):
                    row_cells[ci].text = cell
            return

    p = doc.add_paragraph(block)
    p.paragraph_format.space_after = Pt(6)


def export_docx(pages: list) -> str:
    """Write OCR results to a ``.docx`` file and return its path.

    Args:
        pages: Dicts with a ``"label"`` (section heading) and a ``"text"``
            (Markdown) entry, one per section.

    Returns:
        Path of a temporary ``.docx`` file; it is not deleted automatically.

    The Markdown is split on blank lines. ``#``–``###`` headings, ``$$…$$``
    blocks and pipe tables get dedicated formatting; everything else becomes
    a plain paragraph.
    """
    doc = Document()
    t = doc.add_heading("Kết quả OCR", level=0)
    t.alignment = WD_ALIGN_PARAGRAPH.CENTER

    for item in pages:
        h = doc.add_heading(item["label"], level=1)
        if h.runs:  # an empty label produces a heading without runs
            h.runs[0].font.color.rgb = RGBColor(0x1A, 0x56, 0xDB)
        for block in re.split(r"\n{2,}", item["text"]):
            block = block.strip()
            if block:
                _add_markdown_block(doc, block)
        doc.add_paragraph()

    # Reserve a path, then close the handle before python-docx writes to it.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".docx") as tmp:
        out_path = tmp.name
    doc.save(out_path)
    return out_path
|
|
| |
| |
| |
def _remove_page_images(img_paths: list) -> None:
    """Best-effort removal of rendered page images and their emptied folders."""
    for img_path in img_paths:
        try:
            os.remove(img_path)
        except OSError:
            pass  # already gone or not removable; nothing useful to do
    for folder in {os.path.dirname(p) for p in img_paths}:
        try:
            os.rmdir(folder)  # only succeeds when the folder is empty
        except OSError:
            pass


def process(files, page_range, pdf_mode, progress=gr.Progress(track_tqdm=True)):
    """Run OCR on the uploaded files and build the Markdown and Word outputs.

    Args:
        files: Uploaded files; each item is either a path string or an object
            exposing the path as ``.name``.
        page_range: Page selection applied to every PDF.
        pdf_mode: The selected PDF processing mode label; the multi-page
            label sends all pages of a PDF in one OCR call, any other value
            processes the pages one by one.
        progress: Gradio progress tracker used for the page-by-page loop.

    Returns:
        ``(markdown, docx_path, md_path)``. When the API key or the files are
        missing, a warning message with ``None`` for both paths is returned.
        ``docx_path`` is ``None`` when the Word export fails.
    """
    import logging  # local import: only needed for the export-failure log

    if not os.environ.get("ZHIPU_API_KEY", ""):
        return "⚠️ Chưa cấu hình ZHIPU_API_KEY trong Secrets!", None, None

    if not files:
        return "⚠️ Vui lòng tải lên ít nhất một file.", None, None

    pages, all_md = [], []

    def record(label, text):
        # Every outcome (result, error, notice) goes to both outputs so the
        # Markdown view and the Word file never disagree.
        pages.append({"label": label, "text": text})
        all_md.append(f"## {label}\n\n{text}")

    for file in files:
        path = getattr(file, "name", file)
        name = os.path.basename(path)
        ext = name.lower().rsplit(".", 1)[-1]

        if ext == "pdf":
            try:
                img_paths, pg_nums = pdf_to_image_paths(path, page_range)
            except Exception as e:
                record(f"📄 {name}", f"Lỗi đọc PDF: {e}")
                continue

            try:
                if not img_paths:
                    record(f"📄 {name}", "⚠️ Không có trang nào phù hợp với phạm vi đã chọn.")
                elif pdf_mode == "🚀 Multi-page (1 lần — nhanh hơn)":
                    record(f"📄 {name}", ocr_pdf_pages(img_paths))
                else:
                    # list(...) gives the tracker a length to report against.
                    for pg_num, img_path in progress.tqdm(list(zip(pg_nums, img_paths)), desc=name):
                        try:
                            with Image.open(img_path) as img:
                                text = ocr_image(img)
                        except Exception as e:
                            text = f"Lỗi trang {pg_num}: {e}"
                        record(f"📄 {name} — Trang {pg_num}", text)
            finally:
                # The rendered pages are only needed for the OCR calls above.
                _remove_page_images(img_paths)

        elif ext in ("png", "jpg", "jpeg", "webp", "bmp", "tiff"):
            try:
                with Image.open(path) as img:
                    text = ocr_image(img)
            except Exception as e:
                text = f"Lỗi: {e}"
            record(f"🖼️ {name}", text)
        else:
            record(name, "⚠️ Định dạng không hỗ trợ.")

    combined = "\n\n---\n\n".join(all_md)

    try:
        docx_path = export_docx(pages)
    except Exception:
        # The Word file is optional; keep the Markdown result but leave a trace.
        logging.getLogger(__name__).exception("DOCX export failed")
        docx_path = None

    with tempfile.NamedTemporaryFile(
        delete=False, suffix=".md", mode="w", encoding="utf-8"
    ) as md_tmp:
        md_tmp.write(combined)
    return combined, docx_path, md_tmp.name
|
|
| |
| |
| |
# Dark theme and header styling passed to gr.Blocks below.
CSS = """
body { background: #0f1117 !important; }
.gradio-container { background: #0f1117 !important; font-family: 'Inter', sans-serif !important; }
.header { text-align:center; padding:2rem 1rem 1rem; }
.header h1 { font-size:2rem; font-weight:700;
    background:linear-gradient(90deg,#4f8ef7,#7c3aed);
    -webkit-background-clip:text; -webkit-text-fill-color:transparent; margin:0 0 .3rem; }
.header p { color:#64748b; font-size:.9rem; margin:0; }
.badge { display:inline-block; background:#1a1d27; border:1px solid #2d3148;
    border-radius:20px; padding:.2rem .7rem; font-size:.75rem; color:#4f8ef7; margin:.2rem; }
"""


with gr.Blocks(title="OCR Pro", css=CSS) as demo:
    gr.HTML("""
    <div class="header">
        <h1>⚡ OCR Pro</h1>
        <p>Nhận diện văn bản & công thức Toán từ PDF / ảnh — xuất Markdown & Word</p>
        <div style="margin-top:.7rem">
            <span class="badge">GLM-OCR</span>
            <span class="badge">No GPU</span>
            <span class="badge">Multi-page PDF</span>
            <span class="badge">LaTeX Math</span>
            <span class="badge">Export .docx</span>
        </div>
    </div>
    """)
    with gr.Row():
        # Left column: inputs and options.
        with gr.Column(scale=1, min_width=280):
            files_in = gr.File(
                label="Tải lên PDF hoặc ảnh",
                file_count="multiple",
                file_types=[".pdf", ".png", ".jpg", ".jpeg", ".webp", ".bmp", ".tiff"],
                height=160,
            )
            page_in = gr.Textbox(label="Trang PDF (all / 1-5 / 1,3,5)", value="all")
            pdf_mode = gr.Radio(
                choices=["🚀 Multi-page (1 lần — nhanh hơn)", "📄 Từng trang (chi tiết hơn)"],
                value="🚀 Multi-page (1 lần — nhanh hơn)",
                label="Chế độ xử lý PDF",
            )
            btn = gr.Button("🚀 Nhận diện", variant="primary")
        # Right column: raw Markdown, rendered preview and downloads.
        with gr.Column(scale=3):
            with gr.Tabs():
                with gr.Tab("📝 Markdown / LaTeX"):
                    md_out = gr.Textbox(lines=25, placeholder="Kết quả hiển thị ở đây...")
                with gr.Tab("👁️ Preview"):
                    preview = gr.Markdown()
            with gr.Row():
                docx_out = gr.File(label="⬇️ Tải Word (.docx)", interactive=False)
                md_file = gr.File(label="⬇️ Tải Markdown (.md)", interactive=False)

    def on_run(files, page_range, pdf_mode, progress=gr.Progress(track_tqdm=True)):
        """Click handler: run the OCR and fan the Markdown out to both views.

        The ``gr.Progress`` default has to be declared on the event handler
        itself for Gradio to supply a tracker; it is forwarded to ``process``.
        """
        md, docx, mdf = process(files, page_range, pdf_mode, progress)
        # The same Markdown feeds the raw textbox and the rendered preview.
        return md, md, docx, mdf

    btn.click(
        fn=on_run,
        inputs=[files_in, page_in, pdf_mode],
        outputs=[md_out, preview, docx_out, md_file],
    )


demo.launch()
|
|