Spaces:

GiantAnalytics
/

ArabicOCRExtractor

Running

App Files Community

GiantAnalytics committed on Mar 21, 2025

Commit

af88408

verified ·

1 Parent(s): 7c1aefa

Update app.py

Browse files

Files changed (1) hide show

app.py +38 -53

app.py CHANGED Viewed

@@ -12,9 +12,6 @@ from pytesseract import Output
 import traceback
 import logging
 import sys
-from img2table.document import Image
-from img2table.ocr import TesseractOCR
 # Set up logging
 logging.basicConfig(level=logging.INFO,
@@ -58,65 +55,53 @@ def ocr_extract_text_and_tables(image):
             logger.warning("No image provided")
             return "No image provided", None, None
         # Convert to RGB if needed
         if len(image.shape) == 3 and image.shape[2] == 4:  # RGBA
             image = cv2.cvtColor(image, cv2.COLOR_RGBA2RGB)
-        # Convert image to grayscale for better OCR
-        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
-        # Apply adaptive thresholding to enhance text
-        processed = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)
-        # 1. Extract general text using EasyOCR
-        results = reader.readtext(processed)
-        # Prepare text output
         detected_text = []
         for i, (bbox, text, confidence) in enumerate(results):
             detected_text.append(f"{text} (Confidence: {confidence:.2f})")
-        # 2. Use img2table for structured table extraction
-        logger.info("Running img2table for structured table detection...")
-        img = Image(image)
-        ocr = TesseractOCR(lang="fra")  # French language for OCR
-        # Extract tables
-        tables = img.extract_tables(ocr=ocr)
-        # Convert tables to Pandas DataFrame
-        table_data = []
-        for table in tables:
-            df_table = table.df
-            table_data.append(df_table)
-        # Save extracted tables as CSV (optional)
-        for i, df in enumerate(table_data):
-            df.to_csv(f"extracted_table_{i+1}.csv", index=False)
-        # Annotate image with bounding boxes around detected text
-        pil_image = Image.fromarray(image)
-        draw = ImageDraw.Draw(pil_image)
-        for (bbox, text, confidence) in results:
-            top_left = tuple(map(int, bbox[0]))
-            bottom_right = tuple(map(int, bbox[2]))
-            draw.rectangle([top_left, bottom_right], outline="red", width=3)
-        annotated_image = np.array(pil_image)
-        # Join detected text
-        text_output = "\n".join(detected_text)
-        # Format tables for display
-        tables_output = "\n\n".join([df.to_string(index=False, header=False) for df in table_data])
-        return text_output, tables_output, annotated_image
-    except Exception as e:
-        error_msg = f"Error: {str(e)}"
-        logger.error(error_msg)
-        return error_msg, "Processing failed", None
         # Try to identify table structures based on alignment and spacing
         tables = []

 import traceback
 import logging
 import sys
 # Set up logging
 logging.basicConfig(level=logging.INFO,
             logger.warning("No image provided")
             return "No image provided", None, None
+        logger.info(f"Image shape: {image.shape}, dtype: {image.dtype}")
         # Convert to RGB if needed
         if len(image.shape) == 3 and image.shape[2] == 4:  # RGBA
+            logger.info("Converting RGBA to RGB")
             image = cv2.cvtColor(image, cv2.COLOR_RGBA2RGB)
+        # Create copy for table detection
+        table_image = image.copy()
+        # 1. First extract general text with EasyOCR
+        logger.info("Running EasyOCR text detection...")
+        results = reader.readtext(image)
+        logger.info(f"EasyOCR detected {len(results)} text regions")
+        # Prepare text output and confidence scores
         detected_text = []
         for i, (bbox, text, confidence) in enumerate(results):
+            logger.info(f"Text region {i+1}: '{text}' with confidence {confidence:.2f}")
             detected_text.append(f"{text} (Confidence: {confidence:.2f})")
+        # 2. Use pytesseract for table detection and extraction
+        logger.info("Running Pytesseract for table detection...")
+        try:
+            pytesseract_config = r'--oem 3 --psm 6 -l fra'  # French language
+            logger.info(f"Pytesseract config: {pytesseract_config}")
+            df = pytesseract.image_to_data(table_image, output_type=Output.DATAFRAME, config=pytesseract_config)
+            logger.info(f"Pytesseract returned dataframe with shape: {df.shape}")
+        except Exception as e:
+            logger.error(f"Pytesseract error: {str(e)}")
+            logger.error(traceback.format_exc())
+            df = pd.DataFrame()  # Empty dataframe to continue processing
+        # Filter out low-confidence text
+        try:
+            if not df.empty:
+                logger.info("Filtering low-confidence text...")
+                df = df.dropna(subset=['text'])
+                logger.info(f"After dropna, dataframe shape: {df.shape}")
+                if 'conf' in df.columns:
+                    df = df.query('conf > 50')
+                    logger.info(f"After confidence filtering, dataframe shape: {df.shape}")
+                else:
+                    logger.warning("No 'conf' column found in pytesseract output")
+        except Exception as e:
+            logger.error(f"Error filtering dataframe: {str(e)}")
+            logger.error(traceback.format_exc())
         # Try to identify table structures based on alignment and spacing
         tables = []