GiantAnalytics committed on
Commit
0e09218
·
verified ·
1 Parent(s): a7b5b52

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +154 -0
app.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import easyocr
3
+ import cv2
4
+ import numpy as np
5
+ from PIL import Image, ImageDraw, ImageFont
6
+ import os
7
+ import requests
8
+ from pathlib import Path
9
+ import pandas as pd
10
+ import pytesseract
11
+ from pytesseract import Output
12
+
13
+ # Download and cache the font file
14
def get_font():
    """Return the path to ``Roboto-Regular.ttf``, downloading it on first use.

    The font is cached in the current working directory; later calls reuse
    the cached file and make no network request.

    Returns:
        The font file path as a string.

    Raises:
        requests.RequestException: The download failed, timed out, or the
            server answered with an HTTP error status.
        OSError: The font file could not be written.
    """
    font_path = Path("Roboto-Regular.ttf")
    if not font_path.exists():
        font_url = "https://github.com/google/fonts/raw/main/apache/roboto/Roboto-Regular.ttf"
        # A timeout keeps a stalled connection from hanging the request forever.
        response = requests.get(font_url, timeout=30)
        # Fail loudly on 4xx/5xx so an error page is never cached as the font.
        response.raise_for_status()
        # Write to a temporary name and rename, so an interrupted download
        # cannot leave a truncated file that later passes the exists() check.
        partial_path = font_path.with_name(font_path.name + ".part")
        partial_path.write_bytes(response.content)
        partial_path.replace(font_path)
    return str(font_path)
21
+
22
# Shared EasyOCR reader restricted to French, constructed once at import time
# and reused by every OCR request. Set gpu=False if no GPU is available.
reader = easyocr.Reader(lang_list=["fr"], gpu=True)
24
+
25
def _group_words_into_rows(block_df, row_threshold=10):
    """Group the OCR words of one block into visual rows, each left to right.

    Args:
        block_df: DataFrame of words with ``top``, ``left`` and ``text`` columns.
        row_threshold: Vertical distance in pixels from the first word of the
            current row beyond which a word starts a new row.

    Returns:
        A list of rows; each row is a list of cell strings ordered by ``left``.
    """
    rows = []
    current_row = []
    row_top = None

    for _, word in block_df.sort_values(['top', 'left']).iterrows():
        # A word starts a new row when it sits far enough below the row's first word.
        if row_top is None or abs(word['top'] - row_top) > row_threshold:
            if current_row:
                rows.append(current_row)
            current_row = []
            row_top = word['top']
        current_row.append((word['left'], str(word['text'])))

    if current_row:
        rows.append(current_row)

    # Words on one visual line rarely share an identical ``top``, so the
    # (top, left) sort can interleave them; re-sort each row by ``left``
    # to restore reading order.
    return [
        [text for _, text in sorted(row, key=lambda cell: cell[0])]
        for row in rows
    ]


def _extract_tables(ocr_df, min_confidence=50, min_words=4, row_threshold=10):
    """Build candidate tables from pytesseract word-level output.

    Each block (``block_num``) holding more than ``min_words`` confident words
    that spread over more than one row is turned into a DataFrame, with short
    rows padded with empty strings.

    Args:
        ocr_df: DataFrame returned by ``pytesseract.image_to_data``.
        min_confidence: Words with ``conf`` at or below this value are dropped.
        min_words: A block needs more than this many words to be considered.
        row_threshold: Passed through to the row grouping.

    Returns:
        A list of pandas DataFrames, one per detected table.
    """
    words = ocr_df.dropna(subset=['text'])
    words = words[words['conf'] > min_confidence]
    # Whitespace-only tokens are not cells; drop them so they do not create
    # phantom columns or inflate the per-block word count.
    words = words[words['text'].astype(str).str.strip() != '']

    tables = []
    for _, block_df in words.groupby('block_num', sort=False):
        if len(block_df) <= min_words:
            continue

        table_rows = _group_words_into_rows(block_df, row_threshold)
        # A single row is just a line of text, not a table.
        if len(table_rows) <= 1:
            continue

        max_cols = max(len(row) for row in table_rows)
        padded_rows = [row + [''] * (max_cols - len(row)) for row in table_rows]
        try:
            tables.append(pd.DataFrame(padded_rows))
        except (ValueError, TypeError) as e:
            # Best effort: skip a block that cannot be tabulated, but say so.
            print(f"Skipping table candidate: {e}")

    return tables


def _annotate_detections(image, results):
    """Draw each EasyOCR detection (box, text, confidence) on a copy of ``image``.

    Args:
        image: Image as a numpy array.
        results: Iterable of ``(bbox, text, confidence)`` where ``bbox`` is a
            sequence of (x, y) corner points.

    Returns:
        The annotated image as a numpy array.
    """
    pil_image = Image.fromarray(image)
    draw = ImageDraw.Draw(pil_image)

    # The font is cosmetic: on any failure (download, file, parsing) fall back
    # to Pillow's built-in font rather than failing the whole request.
    try:
        font = ImageFont.truetype(get_font(), size=20)
    except Exception as e:
        print(f"Error loading font: {e}")
        font = ImageFont.load_default()

    for bbox, text, confidence in results:
        # Use the bounding extent of all corner points so the rectangle is
        # always well-ordered, even for skewed detections (inverted
        # coordinates are rejected by recent Pillow versions).
        xs = [int(point[0]) for point in bbox]
        ys = [int(point[1]) for point in bbox]
        top_left = (min(xs), min(ys))
        bottom_right = (max(xs), max(ys))

        draw.rectangle([top_left, bottom_right], outline="red", width=3)
        draw.text(top_left, f"{text} ({confidence:.2f})", fill="blue", font=font)

    return np.array(pil_image)


def ocr_extract_text_and_tables(image):
    """Run French OCR on an image and extract plain text plus table-like blocks.

    EasyOCR supplies the text lines and their boxes; pytesseract word-level
    output is grouped by block and row to approximate tables.

    Args:
        image: Image as a numpy array, or ``None`` when nothing was uploaded.

    Returns:
        A ``(text_output, tables_output, annotated_image)`` tuple: detected
        lines with confidences joined by newlines, the formatted tables as one
        string, and the annotated image as a numpy array. When ``image`` is
        ``None`` the tuple is ``("No image provided", None, None)``.
    """
    if image is None:
        return "No image provided", None, None

    # Convert to RGB if needed
    if len(image.shape) == 3 and image.shape[2] == 4:  # RGBA
        image = cv2.cvtColor(image, cv2.COLOR_RGBA2RGB)

    # Separate copy for table detection
    table_image = image.copy()

    # 1. General text with EasyOCR
    results = reader.readtext(image)
    detected_text = [
        f"{text} (Confidence: {confidence:.2f})"
        for _, text, confidence in results
    ]

    # 2. Word-level data from pytesseract for table detection
    pytesseract_config = r'--oem 3 --psm 6 -l fra'  # French language
    ocr_df = pytesseract.image_to_data(
        table_image, output_type=Output.DATAFRAME, config=pytesseract_config
    )
    tables = _extract_tables(ocr_df)

    annotated_image = _annotate_detections(image, results)

    text_output = "\n".join(detected_text)
    tables_output = "".join(
        f"Table {number}:\n"
        + table.to_string(index=False, header=False)
        + "\n\n"
        for number, table in enumerate(tables, start=1)
    )

    return text_output, tables_output, annotated_image
136
+
137
# Gradio UI: one image upload in; extracted text, extracted tables and the
# annotated image out.
iface = gr.Interface(
    title="French OCR & Table Extractor",
    description=(
        "Upload an image containing French text and tables for OCR processing. "
        "The system will detect and extract both regular text and tabular data."
    ),
    fn=ocr_extract_text_and_tables,
    inputs=gr.Image(label="Upload Image", type="numpy"),
    outputs=[
        gr.Textbox(elem_classes=["output-text"], label="Extracted Text (French)"),
        gr.Textbox(elem_classes=["output-text"], label="Extracted Tables"),
        gr.Image(label="Annotated Image"),
    ],
    examples=[],  # Example images can be listed here
    cache_examples=True,
)

# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()