AP-ocr / app.py
Hong-2's picture
Upload 3 files
5022218 verified
Raw
History Blame Contribute Delete
10.5 kB
import gradio as gr
import tempfile
import os
import re
import base64
from io import BytesIO
from PIL import Image, ImageOps
import fitz # PyMuPDF
from docx import Document
from docx.shared import Pt, RGBColor
from docx.enum.text import WD_ALIGN_PARAGRAPH
from glmocr import GlmOcr
# ---------------------------------------------------------------------------
# GLM-OCR client — uses the Zhipu MaaS API (no GPU required)
# The API key is read from the ZHIPU_API_KEY environment variable
# ---------------------------------------------------------------------------
def get_ocr_client():
    """Build a GLM-OCR client with the MaaS pipeline enabled.

    The API key is taken from the ``ZHIPU_API_KEY`` environment variable and
    falls back to an empty string when the variable is not set.
    """
    maas_settings = {
        "enabled": True,
        "api_key": os.environ.get("ZHIPU_API_KEY", ""),
    }
    overrides = {"pipeline": {"maas": maas_settings}}
    return GlmOcr(config_override=overrides)
def ocr_image(pil_img: Image.Image) -> str:
    """Run OCR on a single PIL image and return the result as Markdown.

    The image is rotated according to its EXIF orientation, converted to RGB,
    encoded as a base64 JPEG data URI and sent to the OCR client.

    Args:
        pil_img: The image to recognise.

    Returns:
        The Markdown produced by the OCR client, a placeholder message when
        the client returns no text, or an error message (``"Lỗi OCR: ..."``)
        when the OCR call raises. Errors raised while preparing the image
        (transpose / convert / JPEG encoding) propagate to the caller.
    """
    # Apply the EXIF orientation BEFORE any mode conversion: convert() returns
    # a new plain Image, and orientation metadata that lives in format-specific
    # structures may not survive it — rotating first works on the decoder's
    # own object.
    pil_img = ImageOps.exif_transpose(pil_img)
    if pil_img.mode != "RGB":
        pil_img = pil_img.convert("RGB")

    # Encode as a base64 data URI
    buf = BytesIO()
    pil_img.save(buf, format="JPEG", quality=90)
    b64 = base64.standard_b64encode(buf.getvalue()).decode()
    data_uri = f"data:image/jpeg;base64,{b64}"

    try:
        with get_ocr_client() as parser:
            result = parser.parse(data_uri)
            return result.markdown or "*(Không nhận diện được)*"
    except Exception as e:
        # Best-effort: the caller displays this text in place of the OCR output.
        return f"Lỗi OCR: {e}"
def ocr_pdf_pages(image_paths: list) -> str:
    """OCR several page images in one request, treating them as one document.

    Each file in ``image_paths`` is read and wrapped in a base64 PNG data URI;
    the whole list is then handed to the OCR client in a single ``parse`` call.
    Returns the resulting Markdown, a placeholder when nothing was recognised,
    or an error message when the OCR call raises. File-reading errors are not
    caught here.
    """
    def _to_data_uri(image_file):
        with open(image_file, "rb") as handle:
            encoded = base64.standard_b64encode(handle.read()).decode()
        return f"data:image/png;base64,{encoded}"

    payload = [_to_data_uri(image_file) for image_file in image_paths]

    try:
        with get_ocr_client() as parser:
            document = parser.parse(payload)
            return document.markdown or "*(Không nhận diện được)*"
    except Exception as e:
        return f"Lỗi OCR multi-page: {e}"
# ---------------------------------------------------------------------------
# PDF → temporary PNG images
# ---------------------------------------------------------------------------
def pdf_to_image_paths(pdf_path: str, page_range: str, dpi: int = 200):
    """Render selected pages of a PDF to PNG files in a fresh temp directory.

    Args:
        pdf_path: Path of the PDF file to open.
        page_range: ``"all"`` (case-insensitive) or a comma-separated list of
            1-based page numbers and inclusive ``a-b`` ranges, e.g. ``"1-5"``
            or ``"1,3,5"``. Tokens that are neither a plain number nor a range
            are ignored; pages outside the document are skipped.
        dpi: Rendering resolution; the scale factor given to the renderer is
            ``dpi / 72``.

    Returns:
        ``(paths, nums)``: the PNG file paths and the matching 1-based page
        numbers, in the order they were requested.

    Raises:
        ValueError: If a range token is malformed (e.g. ``"1-"``, ``"1-2-3"``).
    """
    import shutil  # local import: only needed to clean up after a failure

    doc = fitz.open(pdf_path)
    tmp_dir = None
    try:
        total = len(doc)
        if page_range.strip().lower() == "all":
            pages = list(range(total))
        else:
            pages = []
            for part in page_range.split(","):
                part = part.strip()
                if "-" in part:
                    a, b = part.split("-")
                    # Inclusive 1-based range -> 0-based indices, clamped to the last page.
                    pages.extend(range(int(a) - 1, min(int(b), total)))
                elif part.isdigit():
                    pages.append(int(part) - 1)

        tmp_dir = tempfile.mkdtemp(prefix="pdf_ocr_")
        mat = fitz.Matrix(dpi / 72, dpi / 72)
        paths, nums = [], []
        for i in pages:
            if 0 <= i < total:
                img_path = os.path.join(tmp_dir, f"page_{i+1:04d}.png")
                doc.load_page(i).get_pixmap(matrix=mat, alpha=False).save(img_path)
                paths.append(img_path)
                nums.append(i + 1)
        return paths, nums
    except Exception:
        # The caller never learns the directory name on failure, so remove any
        # partially rendered output here instead of leaking it.
        if tmp_dir is not None:
            shutil.rmtree(tmp_dir, ignore_errors=True)
        raise
    finally:
        # Always release the document, including when parsing or rendering fails.
        doc.close()
# ---------------------------------------------------------------------------
# Export .docx
# ---------------------------------------------------------------------------
def _table_cells(row: str) -> list:
    """Split one Markdown table row into stripped cell texts, keeping empty cells."""
    row = row.strip()
    # Drop only the outer border pipes so that empty inner cells keep their column.
    if row.startswith("|"):
        row = row[1:]
    if row.endswith("|"):
        row = row[:-1]
    return [cell.strip() for cell in row.split("|")]


def export_docx(pages: list) -> str:
    """Write OCR results to a temporary .docx file and return its path.

    Args:
        pages: A list of dicts with the keys ``"label"`` (section heading) and
            ``"text"`` (Markdown-like OCR output for that section).

    Returns:
        Path of the generated ``.docx`` file. The file is created with
        ``delete=False`` and is not removed by this function.

    Each section's text is split on blank lines into blocks, and every block is
    rendered as one of:
      * heading(s)  — leading lines starting with 1-3 ``#``;
      * display math — a block wrapped in ``$$``, kept verbatim in a centred
        monospace paragraph;
      * table       — a block starting with ``|``;
      * paragraph   — everything else.
    """
    doc = Document()
    t = doc.add_heading("Kết quả OCR", level=0)
    t.alignment = WD_ALIGN_PARAGRAPH.CENTER

    for item in pages:
        h = doc.add_heading(item["label"], level=1)
        if h.runs:  # an empty label produces a heading without runs
            h.runs[0].font.color.rgb = RGBColor(0x1A, 0x56, 0xDB)

        for block in re.split(r"\n{2,}", item["text"]):
            block = block.strip()
            if not block:
                continue

            # Peel off leading heading lines. The pattern only consumes the
            # heading's own line, so whatever follows in the same block is
            # kept and rendered by the rules below instead of being dropped.
            hm = re.match(r"^(#{1,3})\s+(.*)", block)
            while hm:
                doc.add_heading(hm.group(2).strip(), level=len(hm.group(1)) + 1)
                block = block[hm.end():].strip()
                hm = re.match(r"^(#{1,3})\s+(.*)", block)
            if not block:
                continue

            if block.startswith("$$") and block.endswith("$$"):
                p = doc.add_paragraph(block)
                p.runs[0].font.name = "Courier New"
                p.runs[0].font.size = Pt(10)
                p.alignment = WD_ALIGN_PARAGRAPH.CENTER
                continue

            if block.startswith("|"):
                # Skip blank lines and the |---|---| separator row.
                rows = [
                    r for r in block.split("\n")
                    if r.strip() and not re.match(r"^\|[-| :]+\|$", r.strip())
                ]
                if rows:
                    grid = [_table_cells(r) for r in rows]
                    # Size the table for the widest row so no cell is cut off.
                    n_cols = max(len(cells) for cells in grid)
                    tbl = doc.add_table(rows=len(grid), cols=n_cols)
                    tbl.style = "Table Grid"
                    for ri, cells in enumerate(grid):
                        row_cells = tbl.rows[ri].cells  # look up once per row
                        for ci, cell in enumerate(cells):
                            row_cells[ci].text = cell
                    continue
                # Only separator rows: fall through and keep the text as a paragraph.

            p = doc.add_paragraph(block)
            p.paragraph_format.space_after = Pt(6)

        doc.add_paragraph()  # blank spacer between sections

    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".docx")
    # Only the unique name is needed; release the handle before python-docx
    # reopens the path (keeping it open leaks a descriptor and blocks the
    # write on platforms that lock open files).
    tmp.close()
    doc.save(tmp.name)
    return tmp.name
# ---------------------------------------------------------------------------
# Main pipeline
# ---------------------------------------------------------------------------
def _discard_rendered_pages(img_paths):
    """Best-effort removal of rendered page images and the folders they leave empty."""
    for img_path in img_paths:
        try:
            os.remove(img_path)
        except OSError:
            pass
    # os.rmdir only removes empty directories, so unrelated files are never touched.
    for folder in {os.path.dirname(p) for p in img_paths}:
        try:
            os.rmdir(folder)
        except OSError:
            pass


def process(files, page_range, pdf_mode, progress=gr.Progress(track_tqdm=True)):
    """Run OCR over the uploaded files and build the Markdown and Word outputs.

    Args:
        files: Uploaded files; each entry is either an object exposing the
            path as ``.name`` or a plain path string.
        page_range: Page selection applied to every PDF (``"all"``, ``"1-5"``,
            ``"1,3,5"``).
        pdf_mode: The selected PDF mode label; the multi-page label sends all
            pages of a PDF in one OCR request, any other value processes the
            pages one by one.
        progress: Gradio progress tracker used for the per-page loop.

    Returns:
        ``(markdown, docx_path, md_path)``. When the API key is missing or no
        file was uploaded, ``(message, None, None)``. ``docx_path`` is ``None``
        when the Word export fails.
    """
    import traceback  # local import: only used to report a failed export

    api_key = os.environ.get("ZHIPU_API_KEY", "")
    if not api_key:
        return "⚠️ Chưa cấu hình ZHIPU_API_KEY trong Secrets!", None, None
    if not files:
        return "⚠️ Vui lòng tải lên ít nhất một file.", None, None

    pages, all_md = [], []

    def record(label, text):
        # Every result, including errors, goes to BOTH outputs so the Markdown
        # view and the Word export always show the same sections.
        pages.append({"label": label, "text": text})
        all_md.append(f"## {label}\n\n{text}")

    for file in files:
        # Accept both file objects (path in .name) and plain path strings.
        path = getattr(file, "name", file)
        name = os.path.basename(path)
        ext = name.lower().rsplit(".", 1)[-1]

        if ext == "pdf":
            try:
                img_paths, pg_nums = pdf_to_image_paths(path, page_range)
            except Exception as e:
                record(f"📄 {name}", f"Lỗi đọc PDF: {e}")
                continue
            try:
                if not img_paths:
                    # Nothing to send: report it instead of calling the OCR
                    # service with an empty document or skipping the file.
                    record(f"📄 {name}", "⚠️ Không có trang nào nằm trong phạm vi đã chọn.")
                elif pdf_mode == "🚀 Multi-page (1 lần — nhanh hơn)":
                    record(f"📄 {name}", ocr_pdf_pages(img_paths))
                else:
                    for pg_num, img_path in progress.tqdm(list(zip(pg_nums, img_paths)), desc=name):
                        try:
                            with Image.open(img_path) as img:
                                text = ocr_image(img)
                        except Exception as e:
                            text = f"Lỗi trang {pg_num}: {e}"
                        record(f"📄 {name} — Trang {pg_num}", text)
            finally:
                # The rendered PNGs are only needed while OCR runs.
                _discard_rendered_pages(img_paths)
        elif ext in ("png", "jpg", "jpeg", "webp", "bmp", "tiff"):
            try:
                with Image.open(path) as img:
                    text = ocr_image(img)
            except Exception as e:
                text = f"Lỗi: {e}"
            record(f"🖼️ {name}", text)
        else:
            record(name, "⚠️ Định dạng không hỗ trợ.")

    combined = "\n\n---\n\n".join(all_md)

    try:
        docx_path = export_docx(pages)
    except Exception:
        # The Word export stays best-effort (Markdown is still returned), but
        # the failure is written to the log instead of vanishing silently.
        traceback.print_exc()
        docx_path = None

    with tempfile.NamedTemporaryFile(
        delete=False, suffix=".md", mode="w", encoding="utf-8"
    ) as md_tmp:
        md_tmp.write(combined)
    return combined, docx_path, md_tmp.name
# ---------------------------------------------------------------------------
# Gradio UI
# ---------------------------------------------------------------------------
# Custom stylesheet passed to gr.Blocks below: dark page background, a centred
# header with a gradient-filled title, and pill-shaped feature badges.
CSS = """
body { background: #0f1117 !important; }
.gradio-container { background: #0f1117 !important; font-family: 'Inter', sans-serif !important; }
.header { text-align:center; padding:2rem 1rem 1rem; }
.header h1 { font-size:2rem; font-weight:700;
background:linear-gradient(90deg,#4f8ef7,#7c3aed);
-webkit-background-clip:text; -webkit-text-fill-color:transparent; margin:0 0 .3rem; }
.header p { color:#64748b; font-size:.9rem; margin:0; }
.badge { display:inline-block; background:#1a1d27; border:1px solid #2d3148;
border-radius:20px; padding:.2rem .7rem; font-size:.75rem; color:#4f8ef7; margin:.2rem; }
"""
# UI layout: upload / options column on the left, results (raw Markdown,
# rendered preview, download files) on the right.
with gr.Blocks(title="OCR Pro", css=CSS) as demo:
    gr.HTML("""
<div class="header">
<h1>⚡ OCR Pro</h1>
<p>Nhận diện văn bản & công thức Toán từ PDF / ảnh — xuất Markdown & Word</p>
<div style="margin-top:.7rem">
<span class="badge">GLM-OCR</span>
<span class="badge">No GPU</span>
<span class="badge">Multi-page PDF</span>
<span class="badge">LaTeX Math</span>
<span class="badge">Export .docx</span>
</div>
</div>
""")
    with gr.Row():
        with gr.Column(scale=1, min_width=280):
            files_in = gr.File(
                label="Tải lên PDF hoặc ảnh",
                file_count="multiple",
                file_types=[".pdf", ".png", ".jpg", ".jpeg", ".webp", ".bmp", ".tiff"],
                height=160,
            )
            page_in = gr.Textbox(label="Trang PDF (all / 1-5 / 1,3,5)", value="all")
            pdf_mode = gr.Radio(
                choices=["🚀 Multi-page (1 lần — nhanh hơn)", "📄 Từng trang (chi tiết hơn)"],
                value="🚀 Multi-page (1 lần — nhanh hơn)",
                label="Chế độ xử lý PDF",
            )
            btn = gr.Button("🚀 Nhận diện", variant="primary")
        with gr.Column(scale=3):
            with gr.Tabs():
                with gr.Tab("📝 Markdown / LaTeX"):
                    md_out = gr.Textbox(lines=25, placeholder="Kết quả hiển thị ở đây...")
                with gr.Tab("👁️ Preview"):
                    preview = gr.Markdown()
            with gr.Row():
                docx_out = gr.File(label="⬇️ Tải Word (.docx)", interactive=False)
                md_file = gr.File(label="⬇️ Tải Markdown (.md)", interactive=False)

    def on_run(files, page_range, pdf_mode, progress=gr.Progress(track_tqdm=True)):
        """Click handler: run the pipeline and fan the Markdown out to both views.

        The gr.Progress default is declared on the handler itself, because
        that is the function Gradio inspects when injecting the tracker; it
        is then forwarded to process() so the per-page loop reports progress.
        """
        md, docx, mdf = process(files, page_range, pdf_mode, progress)
        # The same Markdown feeds the raw textbox and the rendered preview.
        return md, md, docx, mdf

    btn.click(fn=on_run, inputs=[files_in, page_in, pdf_mode],
              outputs=[md_out, preview, docx_out, md_file])

demo.launch()