GiantAnalytics committed on
Commit
af88408
·
verified ·
1 Parent(s): 7c1aefa

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +38 -53
app.py CHANGED
@@ -12,9 +12,6 @@ from pytesseract import Output
12
  import traceback
13
  import logging
14
  import sys
15
- from img2table.document import Image
16
- from img2table.ocr import TesseractOCR
17
-
18
 
19
  # Set up logging
20
  logging.basicConfig(level=logging.INFO,
@@ -58,65 +55,53 @@ def ocr_extract_text_and_tables(image):
58
  logger.warning("No image provided")
59
  return "No image provided", None, None
60
 
 
 
61
  # Convert to RGB if needed
62
  if len(image.shape) == 3 and image.shape[2] == 4: # RGBA
 
63
  image = cv2.cvtColor(image, cv2.COLOR_RGBA2RGB)
64
 
65
- # Convert image to grayscale for better OCR
66
- gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
67
-
68
- # Apply adaptive thresholding to enhance text
69
- processed = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)
70
 
71
- # 1. Extract general text using EasyOCR
72
- results = reader.readtext(processed)
 
 
73
 
74
- # Prepare text output
75
  detected_text = []
76
  for i, (bbox, text, confidence) in enumerate(results):
 
77
  detected_text.append(f"{text} (Confidence: {confidence:.2f})")
78
-
79
- # 2. Use img2table for structured table extraction
80
- logger.info("Running img2table for structured table detection...")
81
- img = Image(image)
82
- ocr = TesseractOCR(lang="fra") # French language for OCR
83
-
84
- # Extract tables
85
- tables = img.extract_tables(ocr=ocr)
86
-
87
- # Convert tables to Pandas DataFrame
88
- table_data = []
89
- for table in tables:
90
- df_table = table.df
91
- table_data.append(df_table)
92
-
93
- # Save extracted tables as CSV (optional)
94
- for i, df in enumerate(table_data):
95
- df.to_csv(f"extracted_table_{i+1}.csv", index=False)
96
-
97
- # Annotate image with bounding boxes around detected text
98
- pil_image = Image.fromarray(image)
99
- draw = ImageDraw.Draw(pil_image)
100
-
101
- for (bbox, text, confidence) in results:
102
- top_left = tuple(map(int, bbox[0]))
103
- bottom_right = tuple(map(int, bbox[2]))
104
- draw.rectangle([top_left, bottom_right], outline="red", width=3)
105
-
106
- annotated_image = np.array(pil_image)
107
-
108
- # Join detected text
109
- text_output = "\n".join(detected_text)
110
-
111
- # Format tables for display
112
- tables_output = "\n\n".join([df.to_string(index=False, header=False) for df in table_data])
113
-
114
- return text_output, tables_output, annotated_image
115
-
116
- except Exception as e:
117
- error_msg = f"Error: {str(e)}"
118
- logger.error(error_msg)
119
- return error_msg, "Processing failed", None
120
 
121
  # Try to identify table structures based on alignment and spacing
122
  tables = []
 
12
  import traceback
13
  import logging
14
  import sys
 
 
 
15
 
16
  # Set up logging
17
  logging.basicConfig(level=logging.INFO,
 
55
  logger.warning("No image provided")
56
  return "No image provided", None, None
57
 
58
+ logger.info(f"Image shape: {image.shape}, dtype: {image.dtype}")
59
+
60
  # Convert to RGB if needed
61
  if len(image.shape) == 3 and image.shape[2] == 4: # RGBA
62
+ logger.info("Converting RGBA to RGB")
63
  image = cv2.cvtColor(image, cv2.COLOR_RGBA2RGB)
64
 
65
+ # Create copy for table detection
66
+ table_image = image.copy()
 
 
 
67
 
68
+ # 1. First extract general text with EasyOCR
69
+ logger.info("Running EasyOCR text detection...")
70
+ results = reader.readtext(image)
71
+ logger.info(f"EasyOCR detected {len(results)} text regions")
72
 
73
+ # Prepare text output and confidence scores
74
  detected_text = []
75
  for i, (bbox, text, confidence) in enumerate(results):
76
+ logger.info(f"Text region {i+1}: '{text}' with confidence {confidence:.2f}")
77
  detected_text.append(f"{text} (Confidence: {confidence:.2f})")
78
+
79
+ # 2. Use pytesseract for table detection and extraction
80
+ logger.info("Running Pytesseract for table detection...")
81
+ try:
82
+ pytesseract_config = r'--oem 3 --psm 6 -l fra' # French language
83
+ logger.info(f"Pytesseract config: {pytesseract_config}")
84
+ df = pytesseract.image_to_data(table_image, output_type=Output.DATAFRAME, config=pytesseract_config)
85
+ logger.info(f"Pytesseract returned dataframe with shape: {df.shape}")
86
+ except Exception as e:
87
+ logger.error(f"Pytesseract error: {str(e)}")
88
+ logger.error(traceback.format_exc())
89
+ df = pd.DataFrame() # Empty dataframe to continue processing
90
+
91
+ # Filter out low-confidence text
92
+ try:
93
+ if not df.empty:
94
+ logger.info("Filtering low-confidence text...")
95
+ df = df.dropna(subset=['text'])
96
+ logger.info(f"After dropna, dataframe shape: {df.shape}")
97
+ if 'conf' in df.columns:
98
+ df = df.query('conf > 50')
99
+ logger.info(f"After confidence filtering, dataframe shape: {df.shape}")
100
+ else:
101
+ logger.warning("No 'conf' column found in pytesseract output")
102
+ except Exception as e:
103
+ logger.error(f"Error filtering dataframe: {str(e)}")
104
+ logger.error(traceback.format_exc())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
 
106
  # Try to identify table structures based on alignment and spacing
107
  tables = []