HunzalaRasheed1 committed on
Commit
b7daa73
·
verified ·
1 Parent(s): 9f7c7bc

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +401 -0
app.py ADDED
@@ -0,0 +1,401 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from flask import Flask, request, jsonify
2
+ from flask_cors import CORS
3
+ import os
4
+ import io
5
+ import base64
6
+ from PIL import Image, ExifTags
7
+ import pytesseract
8
+ import cv2
9
+ import numpy as np
10
+ from datetime import datetime
11
+ import hashlib
12
+ from pdf2image import convert_from_path
13
+ import tempfile
14
+ from reportlab.pdfgen import canvas
15
+ from reportlab.lib.colors import Color
16
+ from reportlab.lib.pagesizes import letter
17
+ import fitz # PyMuPDF
18
+
19
app = Flask(__name__)
CORS(app)

# Configure upload settings
UPLOAD_FOLDER = 'uploads'
ALLOWED_EXTENSIONS = {'png', 'jpg', 'jpeg', 'gif', 'bmp', 'tiff', 'webp', 'pdf'}
MAX_FILE_SIZE = 16 * 1024 * 1024  # 16MB

# Have Flask reject grossly oversized request bodies with HTTP 413 instead of
# accepting them in full. Without this setting the 413 error handler can never
# fire. The 1 MiB of slack leaves room for multipart framing, so a file of
# exactly MAX_FILE_SIZE still reaches the per-file size check in the /extract
# handler.
app.config['MAX_CONTENT_LENGTH'] = MAX_FILE_SIZE + 1024 * 1024

# Create uploads directory if it doesn't exist
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
29
+
30
def allowed_file(filename):
    """Return True when *filename* ends in an extension from ALLOWED_EXTENSIONS.

    The extension is whatever follows the last dot, compared
    case-insensitively. A name that contains no dot is rejected.
    """
    _stem, dot, extension = filename.rpartition('.')
    return dot == '.' and extension.lower() in ALLOWED_EXTENSIONS
34
+
35
def extract_text_from_image(image_path):
    """Extract text from an image file using Tesseract OCR.

    Args:
        image_path: Path of the image file to read.

    Returns:
        dict: On success, ``raw_text`` (the stripped OCR output),
        ``detailed_text`` (one entry per non-empty word whose confidence is
        above 30, holding ``text``, ``confidence`` and a ``bbox`` built from
        the left/top/width/height values reported by pytesseract) and
        ``success`` set to True. On any failure, empty ``raw_text`` and
        ``detailed_text``, ``success`` set to False and the exception text
        under ``error``.
    """
    try:
        # Open the file once and close it deterministically; both OCR calls
        # work on the same image object.
        with Image.open(image_path) as image:
            text = pytesseract.image_to_string(image)
            data = pytesseract.image_to_data(
                image, output_type=pytesseract.Output.DICT
            )

        # Keep only non-empty words recognised with confidence above 30.
        filtered_text = []
        for i, raw_word in enumerate(data['text']):
            word = raw_word.strip()
            if not word:
                continue

            # Confidence values are not guaranteed to be ints (they may be
            # reported as decimals such as '96.0'), and int('96.0') raises
            # ValueError. Go through float() and skip unparsable entries
            # instead of failing the whole OCR result.
            try:
                confidence = int(float(data['conf'][i]))
            except (TypeError, ValueError):
                continue
            if confidence <= 30:
                continue

            filtered_text.append({
                'text': word,
                'confidence': confidence,
                'bbox': {
                    'x': data['left'][i],
                    'y': data['top'][i],
                    'width': data['width'][i],
                    'height': data['height'][i],
                },
            })

        return {
            'raw_text': text.strip(),
            'detailed_text': filtered_text,
            'success': True,
        }
    except Exception as e:
        # Best-effort contract: callers inspect 'success' rather than catching.
        return {
            'raw_text': '',
            'detailed_text': [],
            'success': False,
            'error': str(e),
        }
71
+
72
def extract_image_metadata(image_path):
    """Extract basic properties, EXIF tags and file size from an image file.

    Args:
        image_path: Path of the image file to inspect.

    Returns:
        dict: ``format``, ``mode``, ``size`` (``width``/``height``),
        ``has_transparency``, ``exif`` (tag name -> stringified value, empty
        when the image exposes no EXIF data) and ``file_size`` in bytes.
        If anything fails, a dict holding only ``error`` with the exception
        text.
    """
    try:
        with Image.open(image_path) as img:
            # Basic image info
            metadata = {
                'format': img.format,
                'mode': img.mode,
                'size': {
                    'width': img.width,
                    'height': img.height,
                },
                'has_transparency': (
                    img.mode in ('RGBA', 'LA') or 'transparency' in img.info
                ),
            }

            # Not every image object has _getexif, so look it up defensively
            # and call it only once.
            read_exif = getattr(img, '_getexif', None)
            exif = read_exif() if callable(read_exif) else None

            # Tags missing from ExifTags.TAGS fall back to their numeric id.
            # Force every key to str so the dict never mixes int and str
            # keys, which breaks JSON serialisation when keys are sorted.
            metadata['exif'] = {
                str(ExifTags.TAGS.get(tag_id, tag_id)): str(value)
                for tag_id, value in (exif or {}).items()
            }

        # File size
        metadata['file_size'] = os.path.getsize(image_path)

        return metadata
    except Exception as e:
        return {'error': str(e)}
103
+
104
def analyze_colors(image_path, n_colors=5):
    """Analyze the mean colour and the dominant colours of an image.

    Args:
        image_path: Path of the image file to analyse.
        n_colors: Number of dominant colours to look for (default 5). It is
            clamped to the range 1..number of pixels.

    Returns:
        dict: ``mean_color`` (``r``/``g``/``b`` ints) and ``dominant_colors``,
        a list sorted by descending ``percentage`` whose entries hold
        ``color`` (``r``/``g``/``b``), ``hex`` and ``percentage``.
        If anything fails, a dict holding only ``error`` with the exception
        text.
    """
    try:
        # Load image with OpenCV. imread reports failure by returning None,
        # so turn that into an explicit, readable error.
        img = cv2.imread(image_path)
        if img is None:
            raise ValueError(f"Could not read image: {image_path}")
        img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        # Reshape image to be a list of pixels
        pixels = img_rgb.reshape(-1, 3)

        # Calculate color statistics
        mean_color = np.mean(pixels, axis=0).astype(int).tolist()

        # Imported lazily so the rest of the module loads without scikit-learn.
        from sklearn.cluster import KMeans

        # Never ask for more clusters than there are pixels to cluster.
        n_clusters = max(1, min(int(n_colors), len(pixels)))
        kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        kmeans.fit(pixels)

        colors = kmeans.cluster_centers_.astype(int).tolist()

        # Share of pixels assigned to each cluster, in percent.
        counts = np.bincount(kmeans.labels_, minlength=n_clusters)
        percentages = (counts / counts.sum() * 100).tolist()

        # Combine colors with percentages
        dominant_colors = [
            {
                'color': {'r': r, 'g': g, 'b': b},
                'hex': f"#{r:02x}{g:02x}{b:02x}",
                'percentage': round(share, 2),
            }
            for (r, g, b), share in zip(colors, percentages)
        ]

        # Sort by percentage
        dominant_colors.sort(key=lambda entry: entry['percentage'], reverse=True)

        return {
            'mean_color': {
                'r': mean_color[0],
                'g': mean_color[1],
                'b': mean_color[2],
            },
            'dominant_colors': dominant_colors,
        }
    except Exception as e:
        return {'error': str(e)}
158
+
159
+
160
def draw_text_boxes(image_path, text_data):
    """Draw green boxes around detected text regions and save a copy.

    Args:
        image_path: Path of the source image.
        text_data: Dict with a ``detailed_text`` list whose items carry a
            ``bbox`` dict with ``x``, ``y``, ``width`` and ``height``.

    Returns:
        str: Path of the annotated copy (``<name>_annotated<ext>`` next to
        the source image), or ``image_path`` unchanged if the image could not
        be read, annotated or written.
    """
    try:
        # Read the image. imread reports failure by returning None.
        img = cv2.imread(image_path)
        if img is None:
            raise ValueError(f"Could not read image: {image_path}")

        # Draw boxes for each detected text region
        for item in text_data['detailed_text']:
            bbox = item['bbox']
            cv2.rectangle(
                img,
                (bbox['x'], bbox['y']),
                (bbox['x'] + bbox['width'], bbox['y'] + bbox['height']),
                (0, 255, 0),  # Green color
                2,  # Thickness
            )

        # Insert the suffix in front of the extension only. A plain
        # str.replace('.', ...) would rewrite every dot in the path, including
        # dots in directory names or earlier in the file name.
        root, ext = os.path.splitext(image_path)
        annotated_path = f"{root}_annotated{ext}"

        # Treat a falsy imwrite result as a failure rather than returning a
        # path that may not exist.
        if not cv2.imwrite(annotated_path, img):
            raise ValueError(f"Could not write image: {annotated_path}")
        return annotated_path
    except Exception as e:
        print(f"Error drawing text boxes: {str(e)}")
        return image_path
185
+
186
def extract_text_from_pdf(pdf_path):
    """Extract text from a PDF by rendering each page and running OCR on it.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        dict: On success, ``raw_text`` (page texts, each preceded by a
        ``--- Page N ---`` header and separated by blank lines),
        ``detailed_text`` (the per-word OCR entries, each tagged with a
        1-based ``page``), ``success`` set to True and ``total_pages``.
        Pages whose OCR fails are left out of the text but still counted.
        On any failure, empty ``raw_text`` and ``detailed_text``, ``success``
        set to False and the exception text under ``error``.
    """
    try:
        # Convert PDF to images
        images = convert_from_path(pdf_path)

        all_text = []
        all_detailed_text = []

        # Process each page
        for page_number, image in enumerate(images, start=1):
            # Reserve a file name, then close the handle before writing to it
            # so the image is saved and reopened by path only.
            with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as temp_file:
                temp_path = temp_file.name

            try:
                image.save(temp_path, 'PNG')
                page_text = extract_text_from_image(temp_path)
            finally:
                # Always remove the temporary file, even if saving or OCR raises.
                os.unlink(temp_path)

            # Add page number to the results
            if page_text['success']:
                all_text.append(f"--- Page {page_number} ---\n{page_text['raw_text']}")
                for item in page_text['detailed_text']:
                    item['page'] = page_number
                    all_detailed_text.append(item)

        return {
            'raw_text': '\n\n'.join(all_text),
            'detailed_text': all_detailed_text,
            'success': True,
            'total_pages': len(images),
        }
    except Exception as e:
        return {
            'raw_text': '',
            'detailed_text': [],
            'success': False,
            'error': str(e),
        }
228
+
229
def create_annotated_pdf(original_pdf_path, text_data):
    """Create a copy of a PDF with the detected text regions highlighted.

    Args:
        original_pdf_path: Path of the source PDF.
        text_data: Dict with a ``detailed_text`` list whose items carry a
            1-based ``page`` number and a ``bbox`` dict (``x``, ``y``,
            ``width``, ``height``) in the coordinates of the rendered page
            image.

    Returns:
        str: Path of the annotated copy (``<name>_annotated<ext>`` next to
        the source), or ``original_pdf_path`` unchanged if anything fails.
    """
    try:
        # Insert the suffix in front of the extension. A plain
        # str.replace('.pdf', ...) does nothing for an upper-case '.PDF' (the
        # output would then overwrite the input) and rewrites every '.pdf'
        # occurrence in the path.
        root, ext = os.path.splitext(original_pdf_path)
        annotated_path = f"{root}_annotated{ext or '.pdf'}"

        # Group the OCR items by page once instead of rescanning per page.
        items_by_page = {}
        for item in text_data['detailed_text']:
            items_by_page.setdefault(item['page'], []).append(item)

        # The with-block closes both documents even when an error occurs.
        with fitz.open(original_pdf_path) as doc, fitz.open() as output_pdf:
            for page_num in range(len(doc)):
                page = doc[page_num]
                page_width = page.rect.width
                page_height = page.rect.height

                # Create a new page in the output PDF and copy the original
                # page content onto it.
                output_page = output_pdf.new_page(width=page_width, height=page_height)
                output_page.show_pdf_page(output_page.rect, doc, page_num)

                page_text_items = items_by_page.get(page_num + 1)
                if not page_text_items:
                    # Nothing to highlight: skip the expensive re-render.
                    continue

                # Render the page again to learn the image dimensions that
                # the OCR boxes refer to.
                images = convert_from_path(
                    original_pdf_path,
                    first_page=page_num + 1,
                    last_page=page_num + 1,
                )
                if not images:
                    continue
                img_width, img_height = images[0].size

                # Scaling factors from image coordinates to PDF coordinates.
                scale_x = page_width / img_width
                scale_y = page_height / img_height

                # Draw filled, semi-transparent rectangles around detected text
                for item in page_text_items:
                    bbox = item['bbox']
                    rect = fitz.Rect(
                        bbox['x'] * scale_x,
                        bbox['y'] * scale_y,
                        (bbox['x'] + bbox['width']) * scale_x,
                        (bbox['y'] + bbox['height']) * scale_y,
                    )

                    annot = output_page.add_rect_annot(rect)
                    annot.set_colors(stroke=(0, 1, 0), fill=(0, 1, 0))  # Green
                    annot.set_opacity(0.25)  # 25% opacity
                    annot.update()

            # Save the annotated PDF
            output_pdf.save(annotated_path)

        return annotated_path
    except Exception as e:
        print(f"Error creating annotated PDF: {str(e)}")
        return original_pdf_path
290
+
291
@app.route('/', methods=['GET'])
def home():
    """Report that the API is up and list the endpoints it offers."""
    endpoints = {
        'extract': '/extract - POST - Upload image for data extraction',
        'health': '/ - GET - Health check',
    }
    return jsonify(
        message='Image Processing API is running',
        version='1.0.0',
        endpoints=endpoints,
    )
302
+
303
@app.route('/extract', methods=['POST'])
def extract_image_data():
    """Extract text and metadata from an uploaded image or PDF.

    Expects a multipart upload under the form field ``image``.

    Returns:
        A JSON response. On success it holds ``success``, ``timestamp``,
        ``original_filename``, ``file_size``, ``extracted_text``,
        ``metadata`` and ``annotated_file_base64`` (the annotated image or
        PDF, base64 encoded). Missing, oversized or disallowed uploads yield
        HTTP 400 with an ``error`` message; processing failures yield
        HTTP 500 with ``success`` set to False and an ``error`` message.
    """
    # Check if image file is in request
    if 'image' not in request.files:
        return jsonify({'error': 'No file provided'}), 400

    file = request.files['image']

    # Check if file is selected
    if file.filename == '':
        return jsonify({'error': 'No file selected'}), 400

    # Check file size
    file.seek(0, os.SEEK_END)
    file_size = file.tell()
    file.seek(0)

    if file_size > MAX_FILE_SIZE:
        return jsonify({'error': f'File too large. Maximum size is {MAX_FILE_SIZE // (1024*1024)}MB'}), 400

    if not allowed_file(file.filename):
        return jsonify({
            'error': f'File type not allowed. Allowed types: {", ".join(ALLOWED_EXTENSIONS)}'
        }), 400

    # Initialised up front so the cleanup in ``finally`` knows what exists.
    file_path = None
    annotated_file_path = None
    try:
        # Generate unique filename
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        file_hash = hashlib.md5(file.read()).hexdigest()[:8]
        file.seek(0)  # Reset file pointer

        # Never embed the client-supplied name in a filesystem path: it may
        # contain separators or '..'. Only the already validated extension is
        # kept; the original name is still echoed back in the response.
        extension = file.filename.rsplit('.', 1)[1].lower()
        filename = f"{timestamp}_{file_hash}.{extension}"
        file_path = os.path.join(UPLOAD_FOLDER, filename)

        # Save uploaded file
        file.save(file_path)

        # Extract text and metadata based on file type
        if extension == 'pdf':
            text_data = extract_text_from_pdf(file_path)
            annotated_file_path = create_annotated_pdf(file_path, text_data)
            # The image metadata reader cannot open a PDF, so report the
            # basic facts directly instead of returning an error dict.
            metadata = {
                'format': 'PDF',
                'file_size': os.path.getsize(file_path),
            }
            if 'total_pages' in text_data:
                metadata['total_pages'] = text_data['total_pages']
        else:
            text_data = extract_text_from_image(file_path)
            # Draw boxes around detected text for images
            annotated_file_path = draw_text_boxes(file_path, text_data)
            metadata = extract_image_metadata(file_path)

        # Convert annotated file to base64
        with open(annotated_file_path, "rb") as f:
            file_base64 = base64.b64encode(f.read()).decode('utf-8')

        return jsonify({
            'success': True,
            'timestamp': datetime.now().isoformat(),
            'original_filename': file.filename,
            'file_size': file_size,
            'extracted_text': text_data,
            'metadata': metadata,
            'annotated_file_base64': file_base64,
        })

    except Exception as e:
        return jsonify({
            'success': False,
            'error': f'Error processing file: {str(e)}'
        }), 500

    finally:
        # Remove the upload and its annotated copy on every exit path. The
        # set collapses the case where both names point at the same file.
        for leftover in {file_path, annotated_file_path}:
            if leftover and os.path.exists(leftover):
                try:
                    os.remove(leftover)
                except OSError as cleanup_error:
                    print(f"Error removing {leftover}: {cleanup_error}")
389
+
390
@app.errorhandler(413)
def too_large(e):
    """Answer HTTP 413 errors with a JSON body instead of the default page."""
    payload = jsonify({'error': 'File too large'})
    return payload, 413
393
+
394
@app.errorhandler(500)
def internal_error(e):
    """Answer HTTP 500 errors with a JSON body instead of the default page."""
    payload = jsonify({'error': 'Internal server error'})
    return payload, 500
397
+
398
if __name__ == '__main__':
    # Listen on the port named by the PORT environment variable, falling back
    # to 7860 (Hugging Face Spaces default) when it is not set.
    port = int(os.getenv('PORT', '7860'))
    app.run(host='0.0.0.0', port=port, debug=False)