sbapan41 committed on
Commit
d48735f
·
verified ·
1 Parent(s): 2c94a70

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -346
app.py DELETED
@@ -1,346 +0,0 @@
1
- import os
2
- import tempfile
3
- import json
4
- import logging
5
- import time
6
- from flask import Flask, request, jsonify
7
- from werkzeug.utils import secure_filename
8
- import pdfplumber
9
- from pdf2image import convert_from_path
10
- from PIL import Image
11
- import cv2
12
- import numpy as np
13
- import io
14
- import pandas as pd
15
- try:
16
- from docx import Document
17
- except ImportError:
18
- Document = None # Handle case where python-docx is not installed
19
- import openpyxl
20
- import easyocr
21
-
22
# Flask application object; the route in this file is registered on it.
app = Flask(__name__)

# Configure logging
# Root logger at INFO with timestamp, logger name and level in every record.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Configuration
# Lower-case extensions accepted by allowed_file().
ALLOWED_EXTENSIONS = {'pdf', 'docx', 'txt', 'csv', 'xlsx', 'xls', 'jpg', 'jpeg', 'png'}
# A fresh temporary directory is created every time this module is imported.
# NOTE(review): apart from being stored in app.config below, nothing visible in
# this file reads UPLOAD_FOLDER, and it is never removed -- confirm it is needed.
UPLOAD_FOLDER = tempfile.mkdtemp()
# 'extracted_data' under the current working directory, created at import time.
# NOTE(review): no code visible in this file writes to OUTPUT_FOLDER -- confirm.
OUTPUT_FOLDER = os.path.join(os.getcwd(), 'extracted_data')
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024 # 16MB limit

# API Key Configuration
# Maps an API key string to a client label; validate_api_key() only tests
# membership of the key, the label is not read anywhere visible here.
# NOTE(review): these look like placeholder values -- presumably replaced at
# deployment; keys should not live in source control.
API_KEYS = {
    "your_api_key_1": "client1",
    "your_api_key_2": "client2"
}

# Initialize EasyOCR readers with GPU support
# One reader per language pair: English + Hindi, English + Bengali,
# English + Urdu. All three are constructed at import time.
# NOTE(review): gpu=True is requested unconditionally -- confirm the behaviour
# on hosts without a GPU.
reader_en_hi = easyocr.Reader(['en', 'hi'], gpu=True)
reader_en_bn = easyocr.Reader(['en', 'bn'], gpu=True)
reader_en_ur = easyocr.Reader(['en', 'ur'], gpu=True)
49
-
50
def allowed_file(filename):
    """Return True when *filename* has an extension listed in ALLOWED_EXTENSIONS.

    The extension is the text after the last dot and is compared
    case-insensitively. A name without any dot is rejected.
    """
    _stem, dot, extension = filename.rpartition('.')
    if not dot:
        # No dot at all, so there is no extension to check.
        return False
    return extension.lower() in ALLOWED_EXTENSIONS
52
-
53
def validate_api_key():
    """Report whether the current request carries a known API key.

    Reads the ``X-API-KEY`` request header and returns True only when it is
    present, non-empty and a key of ``API_KEYS``.
    """
    supplied_key = request.headers.get('X-API-KEY')
    return bool(supplied_key) and supplied_key in API_KEYS
59
-
60
def preprocess_image(image):
    """Binarize *image* with adaptive thresholding ahead of OCR.

    The input is converted to a NumPy array, normalised to three channels,
    reduced to grayscale and thresholded. The result is returned as a PIL
    image. If any step raises, the error is logged and the original *image*
    is returned unchanged.
    """
    try:
        pixels = np.array(image)

        # Normalise to 3-channel RGB so the grayscale conversion below always
        # receives the layout it expects.
        if pixels.ndim == 2:  # Grayscale
            pixels = cv2.cvtColor(pixels, cv2.COLOR_GRAY2RGB)
        elif pixels.shape[2] == 4:  # RGBA
            pixels = cv2.cvtColor(pixels, cv2.COLOR_RGBA2RGB)

        grayscale = cv2.cvtColor(pixels, cv2.COLOR_RGB2GRAY)

        # Gaussian adaptive threshold: block size 11, constant 2, max value 255.
        binarized = cv2.adaptiveThreshold(
            grayscale,
            255,
            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY,
            11,
            2,
        )
        return Image.fromarray(binarized)
    except Exception as e:
        logger.error(f"Image preprocessing failed: {str(e)}")
        return image
83
-
84
def extract_text_from_image(image):
    """Run all three EasyOCR readers over *image* and return the joined text.

    The image is preprocessed once, then read by the en+hi, en+bn and en+ur
    readers in that order. The text fragments from each reader are joined
    with single spaces, and the three per-reader strings are joined with
    single spaces too. Returns an empty string (after logging) if anything
    raises.
    """
    try:
        processed_img = preprocess_image(image)

        # Run every reader before assembling any text.
        detections = [
            reader.readtext(np.array(processed_img))
            for reader in (reader_en_hi, reader_en_bn, reader_en_ur)
        ]

        # Index 1 of each detection holds the recognised text.
        per_reader_text = [
            " ".join(item[1] for item in found) for found in detections
        ]
        return " ".join(per_reader_text)
    except Exception as e:
        logger.error(f"OCR extraction failed: {str(e)}")
        return ""
100
-
101
def process_pdf_page(page, page_num, pdf_path):
    """Extract native and OCR text for one PDF page.

    Args:
        page: page object; ``extract_text`` and ``images`` are used.
        page_num: zero-based page index.
        pdf_path: path of the PDF on disk, used to rasterise the page.

    Returns:
        A dict with keys ``page`` (one-based), ``native_text``,
        ``image_text`` and ``type``. ``type`` starts as ``"mixed"`` and only
        becomes ``"ocr_text"`` when OCR text is kept and there is no native
        text.
    """
    one_based = page_num + 1
    result = {
        "page": one_based,
        "native_text": "",
        "image_text": "",
        "type": "mixed"
    }

    # Native text first; a failure here is only a warning.
    try:
        result["native_text"] = page.extract_text(x_tolerance=1, y_tolerance=1) or ""
    except Exception as e:
        logger.warning(f"Native text extraction failed: {str(e)}")

    # OCR is attempted only when the page has embedded images or the native
    # text is shorter than 50 characters after stripping.
    needs_ocr = page.images or len(result["native_text"].strip()) < 50
    if not needs_ocr:
        return result

    try:
        # Rasterise just this page.
        images = convert_from_path(
            pdf_path,
            first_page=one_based,
            last_page=one_based,
            dpi=300,
            size=(2480, 3508))  # A4 size at 300dpi

        if images:
            ocr_text = extract_text_from_image(images[0])

            # Keep the OCR output only when it is longer than the native text.
            if len(ocr_text) > len(result["native_text"]):
                result["image_text"] = ocr_text
                result["type"] = "mixed" if result["native_text"] else "ocr_text"

        # Explicit cleanup
        del images
    except Exception as e:
        logger.error(f"Page image processing failed: {str(e)}")

    return result
142
-
143
def process_docx(file_path):
    """Return the paragraph text of a DOCX file as a single-page result.

    Raises:
        ImportError: if python-docx could not be imported (``Document`` is
            None).
        Exception: any error raised while reading the document is logged and
            re-raised.
    """
    if Document is None:
        raise ImportError("python-docx package is not installed")

    try:
        document = Document(file_path)
        paragraph_texts = []
        for paragraph in document.paragraphs:
            paragraph_texts.append(paragraph.text)

        page_entry = {
            "page": 1,
            "text": "\n".join(paragraph_texts),
            "type": "native_text"
        }
        return {"content": [page_entry]}
    except Exception as e:
        logger.error(f"DOCX processing failed: {str(e)}")
        raise
161
-
162
def process_txt(file_path):
    """Return the contents of a UTF-8 text file as a single-page result.

    The file is decoded with ``utf-8-sig`` so that a leading byte-order mark
    is dropped instead of appearing as ``\\ufeff`` at the start of the text.
    Files without a BOM decode exactly as with plain UTF-8.

    Args:
        file_path: path of the text file to read.

    Returns:
        ``{"content": [{"page": 1, "text": <file text>, "type": "native_text"}]}``

    Raises:
        Exception: any error while opening or decoding the file (for example
            ``UnicodeDecodeError`` for non-UTF-8 input) is logged and
            re-raised.
    """
    try:
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            text = f.read()
        return {
            "content": [{
                "page": 1,
                "text": text,
                "type": "native_text"
            }]
        }
    except Exception as e:
        logger.error(f"TXT processing failed: {str(e)}")
        raise
177
-
178
def process_csv(file_path):
    """Render a CSV file as plain text and return it as a single-page result.

    The file is loaded with pandas and rendered with ``to_string`` without
    the index column. Any error is logged and re-raised.
    """
    try:
        frame = pd.read_csv(file_path)
        page_entry = {
            "page": 1,
            "text": frame.to_string(index=False),
            "type": "table_data"
        }
        return {"content": [page_entry]}
    except Exception as e:
        logger.error(f"CSV processing failed: {str(e)}")
        raise
193
-
194
def process_excel(file_path):
    """Extract the cell data of an Excel workbook as a single-page result.

    ``.xlsx`` files (matched case-insensitively, so ``REPORT.XLSX`` takes the
    same path as ``report.xlsx``) are read with openpyxl and written out one
    row per line with tab-separated cells; empty cells become empty strings.
    Only worksheets are visited, so a workbook containing a chartsheet no
    longer fails on the missing ``iter_rows``. Every other extension is read
    with ``pandas.read_excel`` and each sheet is rendered with ``to_string``.

    Args:
        file_path: path of the workbook on disk.

    Returns:
        ``{"content": [{"page": 1, "text": <all sheets>, "type": "table_data"}]}``
        where each sheet is introduced by ``"\\n\\nSheet: <name>\\n"``.

    Raises:
        Exception: any error while reading the workbook is logged and
            re-raised.
    """
    try:
        # Collect fragments and join once instead of growing a string with +=.
        chunks = []
        if file_path.lower().endswith('.xlsx'):
            wb = openpyxl.load_workbook(file_path)
            for sheet in wb.worksheets:
                chunks.append(f"\n\nSheet: {sheet.title}\n")
                for row in sheet.iter_rows(values_only=True):
                    cells = (str(cell) if cell is not None else "" for cell in row)
                    chunks.append("\t".join(cells) + "\n")
        else:  # .xls
            # sheet_name=None returns a mapping of sheet name -> DataFrame.
            sheets = pd.read_excel(file_path, sheet_name=None)
            for sheet_name, data in sheets.items():
                chunks.append(f"\n\nSheet: {sheet_name}\n{data.to_string(index=False)}\n")

        text = "".join(chunks)
        return {
            "content": [{
                "page": 1,
                "text": text,
                "type": "table_data"
            }]
        }
    except Exception as e:
        logger.error(f"Excel processing failed: {str(e)}")
        raise
220
-
221
def process_image(file_path):
    """Extract text from an image file (JPG, JPEG, PNG) via OCR.

    The image is opened in a ``with`` block so the underlying file handle is
    released as soon as OCR finishes. Previously the handle stayed open until
    garbage collection, which can block the caller's later deletion of the
    temporary file on platforms that lock open files.

    Args:
        file_path: path of the image on disk.

    Returns:
        ``{"content": [{"page": 1, "text": <ocr text>, "type": "ocr_text"}]}``

    Raises:
        Exception: any error while opening the image is logged and re-raised.
    """
    try:
        # OCR must run inside the block: the pixel data is loaded lazily from
        # the open file.
        with Image.open(file_path) as image:
            text = extract_text_from_image(image)
        return {
            "content": [{
                "page": 1,
                "text": text,
                "type": "ocr_text"
            }]
        }
    except Exception as e:
        logger.error(f"Image processing failed: {str(e)}")
        raise
236
-
237
def _single_page_metadata(filename, response, start_time):
    """Build the metadata dict shared by every single-page document type."""
    return {
        "filename": filename,
        "pages": 1,
        "processing_time": round(time.time() - start_time, 2),
        "text_length": len(response['content'][0]['text'])
    }


def _process_pdf_upload(pdf_path, filename, start_time):
    """Run every page of a PDF through process_pdf_page and build the response."""
    with pdfplumber.open(pdf_path) as pdf:
        results = [
            process_pdf_page(page, page_num, pdf_path)
            for page_num, page in enumerate(pdf.pages)
        ]

    # Only the length of the combined text is reported, not the text itself.
    combined_text = "".join(
        page.get("native_text", "") + "\n" + page.get("image_text", "") + "\n"
        for page in results
    )

    return {
        "metadata": {
            "filename": filename,
            "pages": len(results),
            "processing_time": round(time.time() - start_time, 2),
            "text_length": len(combined_text)
        },
        "content": results
    }


@app.route('/process', methods=['POST'])
def handle_file():
    """Handle ``POST /process``: extract text from one uploaded file.

    The request must carry a valid ``X-API-KEY`` header and a multipart
    ``file`` field whose name has an allowed extension. The upload is saved
    into a per-request temporary directory, dispatched on its extension and
    the temporary file and directory are removed afterwards.

    Returns:
        A JSON body with ``metadata`` and ``content`` on success, or a JSON
        ``{"error": ...}`` body with status 401 (bad API key), 400 (missing
        or unsupported file) or 500 (processing failure).
    """
    # API Key validation
    if not validate_api_key():
        return jsonify({"error": "Invalid or missing API key"}), 401

    if 'file' not in request.files:
        return jsonify({"error": "No file provided"}), 400

    file = request.files['file']
    if not file or file.filename == '':
        return jsonify({"error": "No selected file"}), 400

    if not allowed_file(file.filename):
        return jsonify({"error": "Invalid file type"}), 400

    # Take the extension from the name that was just validated.
    # secure_filename() can strip the stem entirely (for example when it is
    # all non-ASCII), leaving a name with no dot; splitting that name used to
    # raise IndexError and turn a valid upload into a 500.
    file_extension = file.filename.rsplit('.', 1)[1].lower()
    filename = secure_filename(file.filename)
    if not allowed_file(filename):
        filename = f"upload.{file_extension}"

    # Every handler returns {"content": [one entry]} and gets the same
    # metadata attached.
    single_page_handlers = {
        'docx': process_docx,
        'txt': process_txt,
        'csv': process_csv,
        'xlsx': process_excel,
        'xls': process_excel,
        'jpg': process_image,
        'jpeg': process_image,
        'png': process_image,
    }

    temp_dir = None
    temp_path = None
    try:
        # Save uploaded file temporarily
        temp_dir = tempfile.mkdtemp()
        temp_path = os.path.join(temp_dir, filename)
        file.save(temp_path)

        start_time = time.time()

        # Process file based on extension
        if file_extension == 'pdf':
            response = _process_pdf_upload(temp_path, filename, start_time)
        elif file_extension in single_page_handlers:
            response = single_page_handlers[file_extension](temp_path)
            response['metadata'] = _single_page_metadata(filename, response, start_time)
        else:
            return jsonify({"error": "Unsupported file type"}), 400

        return jsonify(response)

    except Exception as e:
        logger.error(f"Processing failed: {str(e)}")
        # NOTE(review): the raw exception text is returned to the client;
        # kept for compatibility, but it may reveal internal details.
        return jsonify({"error": str(e)}), 500

    finally:
        # Clean up temporary files; a cleanup failure is logged, never raised.
        try:
            if temp_path and os.path.exists(temp_path):
                os.remove(temp_path)
            if temp_dir and os.path.exists(temp_dir):
                os.rmdir(temp_dir)
        except Exception as e:
            logger.error(f"Cleanup failed: {str(e)}")
344
-
345
if __name__ == '__main__':
    # The server listens on every interface, so the interactive debugger must
    # not be on by default: it lets anyone who can reach the port run Python
    # code. Debug mode is opt-in through the FLASK_DEBUG environment variable.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').strip().lower() in ('1', 'true', 'yes', 'on')
    app.run(host='0.0.0.0', port=5000, debug=debug_enabled)