"""Gradio demo that runs PaddleOCR on an uploaded image and shows the detected text."""

import gradio as gr
from paddleocr import PaddleOCR
from PIL import Image, ImageDraw
from typing import Dict

# Initialize the OCR pipeline once at import time so every request reuses it.
ocr = PaddleOCR(
    text_detection_model_name="PP-OCRv5_mobile_det",
    text_recognition_model_name="PP-OCRv5_mobile_rec",
    use_doc_orientation_classify=False,
    use_doc_unwarping=False,
    use_textline_orientation=True,
)


def replace_text(
    img: Image.Image,
    regions: list,
    new_text="NEW",
    text_color="white",
    box_color=None,
    font=None,
    bg_color=(0, 0, 0),
) -> Image.Image:
    """Cover each region with a filled rectangle and write ``new_text`` on top.

    The image is modified in place and also returned.

    Args:
        img: Image to draw on.
        regions: Dicts carrying a ``"bbox"`` mapping with ``x1``/``y1``/``x2``/``y2``
            keys. Regions with a missing or empty ``"bbox"`` are skipped.
        new_text: Text written at the top-left corner of every region.
        text_color: Fill colour of the replacement text.
        box_color: When truthy, an outline of this colour is drawn around the region.
        font: Optional font object forwarded to ``ImageDraw.text``.
        bg_color: Fill colour used to hide the original text (black by default).

    Returns:
        The same ``img`` object, after drawing.
    """
    draw = ImageDraw.Draw(img)
    for region in regions:
        bbox = region.get("bbox", {})
        if not bbox:
            continue
        corners = [bbox["x1"], bbox["y1"], bbox["x2"], bbox["y2"]]

        # Hide the original text, then write the replacement over it.
        draw.rectangle(corners, fill=bg_color)
        draw.text((corners[0], corners[1]), new_text, fill=text_color, font=font)

        # Optionally outline the region.
        if box_color:
            draw.rectangle(corners, outline=box_color, width=2)
    return img


def draw_ocr(
    image: Image.Image,
    text_regions: list,
    box_color="red",
    text_color="yellow",
) -> Image.Image:
    """Outline every region that has a bounding box on ``image``.

    The image is modified in place and also returned.

    Args:
        image: Image to draw on.
        text_regions: Dicts carrying a ``"bbox"`` mapping with ``x1``/``y1``/``x2``/``y2``
            keys. Regions with a missing or empty ``"bbox"`` are skipped.
        box_color: Outline colour of the rectangles.
        text_color: Currently unused; only rectangles are drawn, no text labels.
            Kept so existing callers that pass it keep working.

    Returns:
        The same ``image`` object, after drawing.
    """
    draw = ImageDraw.Draw(image)
    for region in text_regions:
        bbox = region.get("bbox", {})
        if not bbox:
            continue
        draw.rectangle(
            [bbox["x1"], bbox["y1"], bbox["x2"], bbox["y2"]],
            outline=box_color,
            width=2,
        )
    return image


def extract_json(
    result_json: Dict,
    include_bbox: bool = False,
    min_confidence: float = 0.3,
) -> Dict:
    """Flatten an OCR result dict into text, counts and optional regions.

    Reads ``rec_texts``, ``rec_scores`` and (when ``include_bbox`` is set)
    ``rec_boxes`` from ``result_json["res"]``. A text is kept when it is
    non-blank and its score is strictly greater than ``min_confidence``.

    Args:
        result_json: Mapping with a ``"res"`` entry holding the parallel
            ``rec_texts`` / ``rec_scores`` / ``rec_boxes`` sequences.
        include_bbox: When true, the result also contains ``"text_regions"``.
        min_confidence: Score threshold; entries at or below it are dropped.

    Returns:
        A dict with:
          * ``"extracted_text"``: kept texts, stripped, joined by newlines.
          * ``"text_count"``: number of kept texts.
          * ``"avg_confidence"``: mean of *all* scores (including dropped
            entries), rounded to 2 decimals, or 0 when there are no scores.
          * ``"text_regions"`` (only when ``include_bbox``): one dict per kept
            text that has a box with at least 4 values, holding the original
            (unstripped) text, the integer bbox and the rounded confidence.
    """
    data = result_json.get("res", {})
    texts = data.get("rec_texts", [])
    scores = data.get("rec_scores", [])
    boxes = data.get("rec_boxes", []) if include_bbox else []

    clean_texts = []
    text_regions = []
    for index, (text, score) in enumerate(zip(texts, scores)):
        if not (text and text.strip() and score > min_confidence):
            continue
        clean_texts.append(text.strip())

        # Regions are only built on request. Without this guard the default
        # include_bbox=False path tried to index a missing box and raised.
        if not include_bbox:
            continue
        if index >= len(boxes) or len(boxes[index]) < 4:
            continue  # no usable box for this text

        box = boxes[index]
        text_regions.append(
            {
                "text": text,
                "bbox": {
                    "x1": int(box[0]),
                    "y1": int(box[1]),
                    "x2": int(box[2]),
                    "y2": int(box[3]),
                    "width": int(box[2] - box[0]),
                    "height": int(box[3] - box[1]),
                },
                "confidence": round(float(score), 3),
            }
        )

    # len() rather than truthiness so array-like score containers also work.
    score_count = len(scores)
    avg_confidence = round(sum(scores) / score_count, 2) if score_count else 0

    summary = {
        "extracted_text": "\n".join(clean_texts),
        "text_count": len(clean_texts),
        "avg_confidence": avg_confidence,
    }
    if include_bbox:
        summary["text_regions"] = text_regions
    return summary


def inference(img, lang=None):
    """Run OCR on an image file and return the annotated image plus its text.

    Args:
        img: Path of the image file (the Gradio input uses ``type="filepath"``).
            ``None`` or an empty value means nothing was uploaded.
        lang: Unused; kept for interface compatibility.

    Returns:
        A ``(image, text)`` tuple: the RGB image with detected regions outlined
        and the extracted text. Returns ``(None, "")`` when no image was given,
        and the unannotated image with ``""`` when OCR yields no result.
    """
    if not img:
        return None, ""

    # Close the file handle promptly; convert() returns an independent copy.
    with Image.open(img) as source:
        pil_img = source.convert("RGB")

    results = ocr.predict(img)
    if not results:
        return pil_img, ""

    data = extract_json(results[0].json, include_bbox=True)
    annotated = draw_ocr(pil_img, data.get("text_regions", []))
    return annotated, data.get("extracted_text")


# ===== Gradio interface =====
title = "OCR"
description = """
Support Chinese, Japanese, Korean.
"""

demo = gr.Interface(
    fn=inference,
    inputs=[gr.Image(type="filepath", label="Upload ảnh")],
    outputs=[gr.Image(type="pil", label="Output"), gr.Textbox(label="Text")],
    title=title,
    description=description,
)

if __name__ == "__main__":
    demo.launch()