imperiusrex committed on
Commit
b89622d
·
verified ·
1 Parent(s): aea72b3

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +90 -0
app.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ import cv2
4
+ import numpy as np
5
+ import torch
6
+ import json
7
+ from PIL import Image
8
+ from transformers import CLIPProcessor, CLIPModel
9
+ from paddleocr import PaddleOCR, TextDetection
10
+
11
+ from ocr_utils import (
12
+ run_text_detection,
13
+ crop_and_warp_regions,
14
+ detect_language_clip,
15
+ run_paddle_ocr,
16
+ group_text_by_position
17
+ )
18
+
19
# Load models once at import time so every request reuses them.
# CLIP (ViT-L/14) is used only to identify the language of each cropped
# text region; the OCR itself is done by PaddleOCR in process_image.
clip_model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")

# Maps a lowercased detected-language name to a PaddleOCR language code.
# NOTE(review): "korean" maps to itself because PaddleOCR's identifier
# for Korean is the full word, not a two-letter code — confirm against
# the installed PaddleOCR version.
language_map = {
    "english": "en",
    "telugu": "te",
    "chinese": "ch",
    "korean": "korean"
}

# Natural-language prompts scored by CLIP against each crop; the
# best-matching prompt decides the detected language.
candidates = [
    "This is English text",
    "This is Telugu text",
    "This is Chinese text",
    "This is Korean text"
]
36
+
37
def process_image(image):
    """Run the full OCR pipeline on an uploaded image.

    Pipeline: save the upload to disk, detect text regions, crop/warp
    each region, identify each region's language with CLIP, recognize
    its text with a language-specific PaddleOCR model, then reassemble
    the recognized fragments into lines.

    Parameters
    ----------
    image : numpy.ndarray
        Image array as provided by the Gradio ``Image`` component.

    Returns
    -------
    str
        The reconstructed text, one line per row.
    """
    image_pil = Image.fromarray(image).convert("RGB")
    img_path = "uploaded.jpg"
    image_pil.save(img_path)

    # Detect text bounding boxes, then crop/deskew each region.
    arr = run_text_detection(img_path)
    cropped_images = crop_and_warp_regions(img_path, arr)

    # Cache one PaddleOCR instance per language code: constructing a
    # model is expensive, and the original code rebuilt it for every
    # single crop even when the language repeated.
    ocr_models = {}
    all_results = []

    for crop in cropped_images:
        lang_detected = detect_language_clip(crop, clip_model, clip_processor, candidates)
        # NOTE(review): language_map keys are single words ("english"),
        # so this assumes detect_language_clip returns a language name
        # rather than the full candidate prompt — confirm in ocr_utils;
        # otherwise every crop silently falls back to "en".
        lang_code = language_map.get(lang_detected.lower(), "en")

        if lang_code not in ocr_models:
            ocr_models[lang_code] = PaddleOCR(
                use_doc_orientation_classify=False,
                use_doc_unwarping=False,
                use_textline_orientation=False,
                lang=lang_code,
                det=False,  # detection was already done above
                rec=True,
                cls=False,
                show_log=False,
                use_angle_cls=False
            )

        result_texts = run_paddle_ocr(crop, ocr_models[lang_code])
        all_results.append({
            "lang": lang_detected,
            "texts": result_texts,
            "image": crop
        })

    # Re-order the recognized fragments into reading-order lines.
    lines_final = group_text_by_position(all_results, arr)
    return "\n".join(lines_final)
78
+
79
# Gradio UI: one image input mapped to one text output via process_image.
interface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="numpy", label="Upload an Image"),
    outputs=gr.Textbox(label="Reconstructed Text"),
    title="Printed Text OCR",
    description="Upload a scanned document or printed image. The app detects bounding boxes, extracts text, detects language, and reconstructs the text."
)
86
+
87
if __name__ == "__main__":
    # `spaces` exists only on Hugging Face Spaces, so it is imported
    # lazily here instead of at module top.
    import spaces
    # NOTE(review): the documented `spaces` API is the @spaces.GPU
    # decorator; verify that `spaces.GPU.require` exists in the
    # installed version before relying on it.
    spaces.GPU.require("A100")  # Uses H100/A100 GPU on Hugging Face Spaces
    interface.launch()