Spaces:
Sleeping
Sleeping
| from flask import Flask, request, jsonify | |
| from flask_cors import CORS | |
| import os | |
| import io | |
| import base64 | |
| from PIL import Image, ExifTags | |
| import pytesseract | |
| import cv2 | |
| import numpy as np | |
| from datetime import datetime | |
| import hashlib | |
| from pdf2image import convert_from_path | |
| import tempfile | |
| from reportlab.pdfgen import canvas | |
| from reportlab.lib.colors import Color | |
| from reportlab.lib.pagesizes import letter | |
| import fitz # PyMuPDF | |
# --- Upload settings -------------------------------------------------------
# Directory where uploads are written while a request is being processed.
UPLOAD_FOLDER = '/tmp/uploads'
# File extensions (lower-case, without the dot) accepted by the API.
ALLOWED_EXTENSIONS = {
    'png', 'jpg', 'jpeg', 'gif', 'bmp', 'tiff', 'webp', 'pdf',
}
# Largest accepted upload, in bytes (16MB).
MAX_FILE_SIZE = 16 * 1024 ** 2

# Make sure the upload directory is present before any request arrives.
os.makedirs(UPLOAD_FOLDER, exist_ok=True)

# --- Flask application -----------------------------------------------------
app = Flask(__name__)
# Apply flask_cors to the application with its default settings.
CORS(app)
def allowed_file(filename):
    """Tell whether *filename* carries one of the accepted extensions.

    The extension is whatever follows the last dot, compared
    case-insensitively against ALLOWED_EXTENSIONS. Names without a dot
    are rejected.
    """
    if '.' not in filename:
        return False
    extension = filename.rsplit('.', 1)[-1]
    return extension.lower() in ALLOWED_EXTENSIONS
def extract_text_from_image(image_path):
    """Extract text from an image file using Tesseract OCR.

    Args:
        image_path: Path of the image file to read.

    Returns:
        A dict. On success it holds:
            'raw_text': stripped output of ``pytesseract.image_to_string``.
            'detailed_text': one entry per non-empty word whose confidence
                is above 30, each with 'text', 'confidence' (int) and
                'bbox' ('x', 'y', 'width', 'height').
            'success': True.
        If anything raises, 'raw_text' is '', 'detailed_text' is [],
        'success' is False and 'error' carries the exception message.
    """
    min_confidence = 30
    try:
        # Open the image once and close it deterministically; both OCR
        # passes work on the same in-memory image.
        with Image.open(image_path) as img:
            text = pytesseract.image_to_string(img)
            data = pytesseract.image_to_data(
                img, output_type=pytesseract.Output.DICT
            )

        filtered_text = []
        rows = zip(
            data['text'], data['conf'],
            data['left'], data['top'], data['width'], data['height'],
        )
        for word, conf, left, top, width, height in rows:
            # Some pytesseract/Tesseract combinations report the confidence
            # as a decimal string (e.g. '96.5'); int() alone rejects that,
            # so go through float() first.
            confidence = int(float(conf))
            word = word.strip()
            if confidence > min_confidence and word:
                filtered_text.append({
                    'text': word,
                    'confidence': confidence,
                    'bbox': {
                        'x': left,
                        'y': top,
                        'width': width,
                        'height': height,
                    },
                })

        return {
            'raw_text': text.strip(),
            'detailed_text': filtered_text,
            'success': True,
        }
    except Exception as e:
        # Best-effort contract: callers inspect 'success' instead of
        # handling exceptions.
        return {
            'raw_text': '',
            'detailed_text': [],
            'success': False,
            'error': str(e),
        }
def extract_image_metadata(image_path):
    """Collect basic properties and EXIF data of an image file.

    Args:
        image_path: Path of the image file to inspect.

    Returns:
        A dict with 'format', 'mode', 'size' ('width'/'height'),
        'has_transparency', 'exif' (tag name -> stringified value, empty
        when the image exposes no EXIF data) and 'file_size' in bytes.
        If anything raises, ``{'error': <message>}`` is returned instead.
    """
    try:
        with Image.open(image_path) as img:
            metadata = {
                'format': img.format,
                'mode': img.mode,
                'size': {
                    'width': img.width,
                    'height': img.height,
                },
                'has_transparency': (
                    img.mode in ('RGBA', 'LA') or 'transparency' in img.info
                ),
            }

            # Not every image class provides _getexif; read it once rather
            # than parsing the EXIF block twice.
            read_exif = getattr(img, '_getexif', None)
            raw_exif = read_exif() if read_exif is not None else None

            # Unknown tag ids have no entry in ExifTags.TAGS and would stay
            # integers. Mixed int/str keys cannot be ordered, which makes
            # key-sorted JSON serialisation raise TypeError, so every key is
            # normalised to str (JSON object keys are strings anyway).
            metadata['exif'] = {
                str(ExifTags.TAGS.get(tag_id, tag_id)): str(value)
                for tag_id, value in (raw_exif or {}).items()
            }

            metadata['file_size'] = os.path.getsize(image_path)
            return metadata
    except Exception as e:
        return {'error': str(e)}
# NOTE(review): the "def" line for this body is absent from the reviewed
# source; the function name and signature below are reconstructed from the
# docstring and the body's use of ``image_path`` — confirm against callers.
def analyze_image_colors(image_path, n_colors=5):
    """Analyze dominant colors in the image.

    Args:
        image_path: Path of the image file to analyze.
        n_colors: Number of k-means clusters, i.e. dominant colors to
            report. Defaults to 5; clamped to the number of pixels.

    Returns:
        A dict with:
            'mean_color': {'r', 'g', 'b'} average over all pixels.
            'dominant_colors': list of {'color', 'hex', 'percentage'}
                entries sorted by descending percentage.
        If anything raises (unreadable image, scikit-learn missing, ...),
        ``{'error': <message>}`` is returned instead.
    """
    try:
        from sklearn.cluster import KMeans

        img = cv2.imread(image_path)
        if img is None:
            # cv2.imread signals failure by returning None, not by raising.
            raise ValueError(f"Could not read image: {image_path}")

        # OpenCV loads BGR; convert and flatten to a list of RGB pixels.
        pixels = cv2.cvtColor(img, cv2.COLOR_BGR2RGB).reshape(-1, 3)
        mean_color = np.mean(pixels, axis=0).astype(int).tolist()

        # k-means cannot form more clusters than there are samples.
        n_clusters = min(n_colors, len(pixels))
        kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        kmeans.fit(pixels)

        centers = kmeans.cluster_centers_.astype(int).tolist()
        labels = kmeans.labels_
        total_pixels = len(labels)
        counts = np.bincount(labels, minlength=n_clusters)

        dominant_colors = [
            {
                'color': {'r': r, 'g': g, 'b': b},
                'hex': f"#{r:02x}{g:02x}{b:02x}",
                'percentage': round(float(count) / total_pixels * 100, 2),
            }
            for (r, g, b), count in zip(centers, counts)
        ]
        dominant_colors.sort(key=lambda x: x['percentage'], reverse=True)

        return {
            'mean_color': {
                'r': mean_color[0],
                'g': mean_color[1],
                'b': mean_color[2],
            },
            'dominant_colors': dominant_colors,
        }
    except Exception as e:
        return {'error': str(e)}
def draw_text_boxes(image_path, text_data):
    """Draw boxes around detected text regions and save an annotated copy.

    Args:
        image_path: Path of the source image.
        text_data: OCR result dict; its 'detailed_text' entries must each
            carry a 'bbox' with 'x', 'y', 'width' and 'height'.

    Returns:
        Path of the annotated image ("<name>_annotated<ext>" next to the
        source). If anything fails, the error is printed and the original
        ``image_path`` is returned instead.
    """
    try:
        img = cv2.imread(image_path)
        if img is None:
            # cv2.imread signals failure by returning None, not by raising.
            raise ValueError(f"Could not read image: {image_path}")

        for item in text_data['detailed_text']:
            bbox = item['bbox']
            top_left = (bbox['x'], bbox['y'])
            bottom_right = (
                bbox['x'] + bbox['width'],
                bbox['y'] + bbox['height'],
            )
            # Green outline, 2 px thick.
            cv2.rectangle(img, top_left, bottom_right, (0, 255, 0), 2)

        # Insert the suffix before the extension only. A plain
        # str.replace('.', ...) would also rewrite dots in directory names
        # or elsewhere in the file name.
        root, ext = os.path.splitext(image_path)
        annotated_path = f"{root}_annotated{ext}"
        if not cv2.imwrite(annotated_path, img):
            raise OSError(f"Could not write annotated image: {annotated_path}")
        return annotated_path
    except Exception as e:
        print(f"Error drawing text boxes: {str(e)}")
        return image_path
def extract_text_from_pdf(pdf_path):
    """Extract text from a PDF by rasterising each page and running OCR.

    Args:
        pdf_path: Path of the PDF file.

    Returns:
        A dict. On success it holds:
            'raw_text': page texts joined by blank lines, each prefixed
                with "--- Page N ---".
            'detailed_text': word entries from every successfully read
                page, each tagged with a 1-based 'page' number.
            'success': True.
            'total_pages': number of rasterised pages.
        If anything raises, 'raw_text' is '', 'detailed_text' is [],
        'success' is False and 'error' carries the exception message.
    """
    try:
        images = convert_from_path(pdf_path)
        all_text = []
        all_detailed_text = []

        for page_number, image in enumerate(images, start=1):
            # Reserve a temp file name; the OCR helper works on paths.
            with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as temp_file:
                temp_path = temp_file.name
            try:
                image.save(temp_path, 'PNG')
                page_text = extract_text_from_image(temp_path)
            finally:
                # Remove the temp file even when saving or OCR raises.
                os.unlink(temp_path)

            # Pages whose OCR failed are skipped rather than failing the
            # whole document.
            if page_text['success']:
                all_text.append(f"--- Page {page_number} ---\n{page_text['raw_text']}")
                for item in page_text['detailed_text']:
                    item['page'] = page_number
                    all_detailed_text.append(item)

        return {
            'raw_text': '\n\n'.join(all_text),
            'detailed_text': all_detailed_text,
            'success': True,
            'total_pages': len(images),
        }
    except Exception as e:
        return {
            'raw_text': '',
            'detailed_text': [],
            'success': False,
            'error': str(e),
        }
def create_annotated_pdf(original_pdf_path, text_data):
    """Create a copy of a PDF with detected text regions highlighted.

    Args:
        original_pdf_path: Path of the source PDF.
        text_data: OCR result dict; its 'detailed_text' entries must carry
            a 1-based 'page' number and a 'bbox' in the pixel coordinates
            of the rasterised page.

    Returns:
        Path of the annotated PDF ("<name>_annotated<ext>" next to the
        source). If anything fails, the error is printed and
        ``original_pdf_path`` is returned instead.
    """
    doc = None
    output_pdf = None
    try:
        doc = fitz.open(original_pdf_path)
        output_pdf = fitz.open()

        # Group the OCR items by page once instead of rescanning the whole
        # list for every page.
        items_by_page = {}
        for item in text_data['detailed_text']:
            items_by_page.setdefault(item['page'], []).append(item)

        for page_num in range(len(doc)):
            page = doc[page_num]
            page_width = page.rect.width
            page_height = page.rect.height

            # Copy the original page content onto a same-sized new page.
            output_page = output_pdf.new_page(width=page_width, height=page_height)
            output_page.show_pdf_page(output_page.rect, doc, page_num)

            page_text_items = items_by_page.get(page_num + 1, [])
            if not page_text_items:
                # Nothing to highlight: skip the costly rasterisation.
                continue

            # Rasterise the page the same way the OCR step did, to learn
            # the pixel dimensions the bounding boxes refer to.
            images = convert_from_path(
                original_pdf_path,
                first_page=page_num + 1,
                last_page=page_num + 1,
            )
            if not images:
                continue
            img_width, img_height = images[0].size
            scale_x = page_width / img_width
            scale_y = page_height / img_height

            for item in page_text_items:
                bbox = item['bbox']
                # Scale pixel coordinates to PDF space.
                rect = fitz.Rect(
                    bbox['x'] * scale_x,
                    bbox['y'] * scale_y,
                    (bbox['x'] + bbox['width']) * scale_x,
                    (bbox['y'] + bbox['height']) * scale_y,
                )
                # Filled, semi-transparent green rectangle (25% opacity).
                annot = output_page.add_rect_annot(rect)
                annot.set_colors(stroke=(0, 1, 0), fill=(0, 1, 0))
                annot.set_opacity(0.25)
                annot.update()

        # Insert the suffix before the extension only. str.replace('.pdf',
        # ...) is case-sensitive (a ".PDF" upload would be overwritten in
        # place) and rewrites every occurrence in the path.
        root, ext = os.path.splitext(original_pdf_path)
        annotated_path = f"{root}_annotated{ext}"
        output_pdf.save(annotated_path)
        return annotated_path
    except Exception as e:
        print(f"Error creating annotated PDF: {str(e)}")
        return original_pdf_path
    finally:
        # Release both documents on success and on failure.
        for handle in (output_pdf, doc):
            if handle is not None:
                handle.close()
# Register the view; without a route decorator Flask never serves it. The
# path and method follow the endpoint table returned below.
@app.route('/', methods=['GET'])
def home():
    """Health check endpoint.

    Returns:
        A JSON response with a status message, the API version and a short
        description of the available endpoints.
    """
    return jsonify({
        'message': 'Image Processing API is running',
        'version': '1.0.0',
        'endpoints': {
            'extract': '/extract - POST - Upload image for data extraction',
            'health': '/ - GET - Health check',
        },
    })
def _remove_if_exists(path):
    """Delete *path* if it is set and present; report but do not raise on failure."""
    if not path or not os.path.exists(path):
        return
    try:
        os.remove(path)
    except OSError as e:
        print(f"Error removing temporary file {path}: {str(e)}")


# Register the view; without a route decorator Flask never serves it. The
# path and method follow the endpoint table returned by the health check.
@app.route('/extract', methods=['POST'])
def extract_image_data():
    """Extract visual data from uploaded image or PDF.

    Expects a multipart upload in the 'image' form field.

    Returns:
        200 with a JSON body holding 'success', 'timestamp',
            'original_filename', 'file_size', 'extracted_text', 'metadata'
            and 'annotated_file_base64'.
        400 with {'error': ...} when the file is missing, unnamed, larger
            than MAX_FILE_SIZE or of a disallowed type.
        500 with {'success': False, 'error': ...} when processing raises.
    """
    if 'image' not in request.files:
        return jsonify({'error': 'No file provided'}), 400

    file = request.files['image']
    if file.filename == '':
        return jsonify({'error': 'No file selected'}), 400

    # Measure the upload by seeking to its end, then rewind.
    file.seek(0, os.SEEK_END)
    file_size = file.tell()
    file.seek(0)
    if file_size > MAX_FILE_SIZE:
        return jsonify({'error': f'File too large. Maximum size is {MAX_FILE_SIZE // (1024*1024)}MB'}), 400

    if not (file and allowed_file(file.filename)):
        # Sorted so the message is stable across runs (set order is not).
        allowed_types = ", ".join(sorted(ALLOWED_EXTENSIONS))
        return jsonify({
            'error': f'File type not allowed. Allowed types: {allowed_types}'
        }), 400

    file_path = None
    annotated_file_path = None
    try:
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        file_hash = hashlib.md5(file.read()).hexdigest()[:8]
        file.seek(0)  # Reset file pointer after hashing

        # Build the on-disk name from server-side values plus the validated,
        # lower-cased extension only. The client-supplied name is untrusted
        # (it may contain path separators) and is never used as a path.
        extension = file.filename.rsplit('.', 1)[1].lower()
        file_path = os.path.join(UPLOAD_FOLDER, f"{timestamp}_{file_hash}.{extension}")
        file.save(file_path)

        if extension == 'pdf':
            text_data = extract_text_from_pdf(file_path)
            annotated_file_path = create_annotated_pdf(file_path, text_data)
            # The image metadata reader cannot open PDFs, so report basic
            # facts about the document instead of an error entry.
            metadata = {
                'format': 'PDF',
                'file_size': os.path.getsize(file_path),
            }
            if 'total_pages' in text_data:
                metadata['total_pages'] = text_data['total_pages']
        else:
            text_data = extract_text_from_image(file_path)
            annotated_file_path = draw_text_boxes(file_path, text_data)
            metadata = extract_image_metadata(file_path)

        with open(annotated_file_path, "rb") as f:
            file_base64 = base64.b64encode(f.read()).decode('utf-8')

        return jsonify({
            'success': True,
            'timestamp': datetime.now().isoformat(),
            'original_filename': file.filename,
            'file_size': file_size,
            'extracted_text': text_data,
            'metadata': metadata,
            'annotated_file_base64': file_base64,
        })
    except Exception as e:
        return jsonify({
            'success': False,
            'error': f'Error processing file: {str(e)}'
        }), 500
    finally:
        # Always remove the temporary files. The annotators return the
        # original path on failure, so de-duplicate before deleting.
        for path in {file_path, annotated_file_path}:
            _remove_if_exists(path)
# Register the handler; without the decorator Flask never calls it.
# NOTE(review): nothing in the visible code sets MAX_CONTENT_LENGTH, so Flask
# itself may never raise a 413 for this app — confirm.
@app.errorhandler(413)
def too_large(e):
    """Return a JSON body for HTTP 413 (request entity too large).

    Args:
        e: The error passed in by Flask; unused.
    """
    return jsonify({'error': 'File too large'}), 413
# Register the handler; without the decorator Flask never calls it.
@app.errorhandler(500)
def internal_error(e):
    """Return a JSON body for HTTP 500 (internal server error).

    Args:
        e: The error passed in by Flask; unused.
    """
    return jsonify({'error': 'Internal server error'}), 500
if __name__ == '__main__':
    # Start the built-in server on every interface, port 7860, with the
    # debugger switched off.
    app.run(host='0.0.0.0', port=7860, debug=False)