Spaces:

GiantAnalytics
/

ArabicOCRExtractor

Running

App Files Files Community

GiantAnalytics committed on Mar 21, 2025

Commit

fe90fd9

verified ·

1 Parent(s): b828227

Update app.py

Browse files

Files changed (1) hide show

app.py +237 -125

app.py CHANGED Viewed

@@ -9,146 +9,258 @@ from pathlib import Path
 import pandas as pd
 import pytesseract
 from pytesseract import Output
 # Download and cache the font file
 def get_font():
-    font_path = Path("Roboto-Regular.ttf")
-    if not font_path.exists():
-        font_url = "https://github.com/google/fonts/raw/main/apache/roboto/Roboto-Regular.ttf"
-        response = requests.get(font_url)
-        font_path.write_bytes(response.content)
-    return str(font_path)
 # Initialize EasyOCR Reader for French
-reader = easyocr.Reader(['fr'], gpu=True)  # Set gpu=False if no GPU available
 def ocr_extract_text_and_tables(image):
-    if image is None:
-        return "No image provided", None, None
-    # Convert to RGB if needed
-    if len(image.shape) == 3 and image.shape[2] == 4:  # RGBA
-        image = cv2.cvtColor(image, cv2.COLOR_RGBA2RGB)
-    # Create copy for table detection
-    table_image = image.copy()
-    # 1. First extract general text with EasyOCR
-    results = reader.readtext(image)
-    # Prepare text output and confidence scores
-    detected_text = []
-    for (_, text, confidence) in results:
-        detected_text.append(f"{text} (Confidence: {confidence:.2f})")
-    # 2. Use pytesseract for table detection and extraction
-    # This approach uses pytesseract's data.frame output to identify potential tables
-    pytesseract_config = r'--oem 3 --psm 6 -l fra'  # French language
-    df = pytesseract.image_to_data(table_image, output_type=Output.DATAFRAME, config=pytesseract_config)
-    # Filter out low-confidence text
-    df = df.dropna(subset=['text']).query('conf > 50')
-    # Try to identify table structures based on alignment and spacing
-    tables = []
-    # Simple table detection: look for text that's aligned in columns with similar x-coordinates
-    # Group by block_num which often separates tables
-    blocks = df['block_num'].unique()
-    for block in blocks:
-        block_df = df[df['block_num'] == block]
-        if len(block_df) > 4:  # Assuming a table has at least a few cells
-            # Sort by top-to-bottom (vertical position)
-            sorted_df = block_df.sort_values(['top', 'left'])
-            # Convert to pandas table format
-            table_rows = []
-            current_row = []
-            last_top = -100
-            for _, row in sorted_df.iterrows():
-                # If we're on a new row (based on vertical position)
-                if abs(row['top'] - last_top) > 10:  # Threshold for new row
-                    if current_row:
-                        table_rows.append(current_row)
-                        current_row = []
-                    last_top = row['top']
-                current_row.append(row['text'])
-            # Add the last row
-            if current_row:
-                table_rows.append(current_row)
-            # If we have multiple rows, we might have a table
-            if len(table_rows) > 1:
-                # Try to create a pandas DataFrame
                 try:
-                    # Pad rows to have equal length
-                    max_cols = max(len(row) for row in table_rows)
-                    padded_rows = [row + [''] * (max_cols - len(row)) for row in table_rows]
-                    # Create DataFrame
-                    table_df = pd.DataFrame(padded_rows)
-                    # Add to tables list
-                    tables.append(table_df)
-                except:
-                    pass
-    # Create annotated image
-    pil_image = Image.fromarray(image)
-    draw = ImageDraw.Draw(pil_image)
-    # Get font for annotation
-    try:
-        font = ImageFont.truetype(get_font(), size=20)
     except Exception as e:
-        print(f"Error loading font: {e}")
-        font = ImageFont.load_default()
-    # Draw boxes and text for regular text detection
-    for (bbox, text, confidence) in results:
-        # Convert points to integers
-        top_left = tuple(map(int, bbox[0]))
-        bottom_right = tuple(map(int, bbox[2]))
-        # Draw rectangle
-        draw.rectangle([top_left, bottom_right], outline="red", width=3)
-        # Draw text with confidence
-        text_with_conf = f"{text} ({confidence:.2f})"
-        draw.text(top_left, text_with_conf, fill="blue", font=font)
-    # Convert back to numpy array
-    annotated_image = np.array(pil_image)
-    # Join detected text with proper formatting
-    text_output = "\n".join(detected_text)
-    # Format tables for display
-    tables_output = ""
-    for i, table in enumerate(tables):
-        tables_output += f"Table {i+1}:\n"
-        tables_output += table.to_string(index=False, header=False) + "\n\n"
-    return text_output, tables_output, annotated_image
 # Create Gradio interface
-iface = gr.Interface(
-    fn=ocr_extract_text_and_tables,
-    inputs=gr.Image(type="numpy", label="Upload Image"),
-    outputs=[
-        gr.Textbox(label="Extracted Text (French)", elem_classes=["output-text"]),
-        gr.Textbox(label="Extracted Tables", elem_classes=["output-text"]),
-        gr.Image(label="Annotated Image")
-    ],
-    title="French OCR & Table Extractor",
-    description="Upload an image containing French text and tables for OCR processing. The system will detect and extract both regular text and tabular data.",
-    examples=[],  # You can add example images here
-    cache_examples=True
-)
 # Launch the interface
 if __name__ == "__main__":
-    iface.launch()

 import pandas as pd
 import pytesseract
 from pytesseract import Output
+import traceback
+import logging
+import sys
+# Set up logging
+logging.basicConfig(level=logging.INFO,
+                    format='%(asctime)s - %(levelname)s - %(message)s',
+                    handlers=[logging.StreamHandler(sys.stdout)])
+logger = logging.getLogger(__name__)
 # Download and cache the font file
 def get_font():
+    try:
+        logger.info("Attempting to get font...")
+        font_path = Path("Roboto-Regular.ttf")
+        if not font_path.exists():
+            logger.info("Font not found, downloading...")
+            font_url = "https://github.com/google/fonts/raw/main/apache/roboto/Roboto-Regular.ttf"
+            response = requests.get(font_url)
+            font_path.write_bytes(response.content)
+            logger.info("Font downloaded successfully")
+        else:
+            logger.info("Font already exists")
+        return str(font_path)
+    except Exception as e:
+        logger.error(f"Error in get_font: {str(e)}")
+        logger.error(traceback.format_exc())
+        return None
 # Initialize EasyOCR Reader for French
+try:
+    logger.info("Initializing EasyOCR Reader for French...")
+    reader = easyocr.Reader(['fr'], gpu=False)  # Changed to False since you're on CPU
+    logger.info("EasyOCR Reader initialized successfully")
+except Exception as e:
+    logger.error(f"Error initializing EasyOCR: {str(e)}")
+    logger.error(traceback.format_exc())
 def ocr_extract_text_and_tables(image):
+    try:
+        logger.info("Starting OCR extraction...")
+        if image is None:
+            logger.warning("No image provided")
+            return "No image provided", None, None
+        logger.info(f"Image shape: {image.shape}, dtype: {image.dtype}")
+        # Convert to RGB if needed
+        if len(image.shape) == 3 and image.shape[2] == 4:  # RGBA
+            logger.info("Converting RGBA to RGB")
+            image = cv2.cvtColor(image, cv2.COLOR_RGBA2RGB)
+        # Create copy for table detection
+        table_image = image.copy()
+        # 1. First extract general text with EasyOCR
+        logger.info("Running EasyOCR text detection...")
+        results = reader.readtext(image)
+        logger.info(f"EasyOCR detected {len(results)} text regions")
+        # Prepare text output and confidence scores
+        detected_text = []
+        for i, (bbox, text, confidence) in enumerate(results):
+            logger.info(f"Text region {i+1}: '{text}' with confidence {confidence:.2f}")
+            detected_text.append(f"{text} (Confidence: {confidence:.2f})")
+        # 2. Use pytesseract for table detection and extraction
+        logger.info("Running Pytesseract for table detection...")
+        try:
+            pytesseract_config = r'--oem 3 --psm 6 -l fra'  # French language
+            logger.info(f"Pytesseract config: {pytesseract_config}")
+            df = pytesseract.image_to_data(table_image, output_type=Output.DATAFRAME, config=pytesseract_config)
+            logger.info(f"Pytesseract returned dataframe with shape: {df.shape}")
+        except Exception as e:
+            logger.error(f"Pytesseract error: {str(e)}")
+            logger.error(traceback.format_exc())
+            df = pd.DataFrame()  # Empty dataframe to continue processing
+        # Filter out low-confidence text
+        try:
+            if not df.empty:
+                logger.info("Filtering low-confidence text...")
+                df = df.dropna(subset=['text'])
+                logger.info(f"After dropna, dataframe shape: {df.shape}")
+                if 'conf' in df.columns:
+                    df = df.query('conf > 50')
+                    logger.info(f"After confidence filtering, dataframe shape: {df.shape}")
+                else:
+                    logger.warning("No 'conf' column found in pytesseract output")
+        except Exception as e:
+            logger.error(f"Error filtering dataframe: {str(e)}")
+            logger.error(traceback.format_exc())
+        # Try to identify table structures based on alignment and spacing
+        tables = []
+        try:
+            if not df.empty and 'block_num' in df.columns:
+                logger.info("Attempting to identify tables...")
+                # Simple table detection: look for text that's aligned in columns with similar x-coordinates
+                # Group by block_num which often separates tables
+                blocks = df['block_num'].unique()
+                logger.info(f"Found {len(blocks)} text blocks")
+                for block in blocks:
+                    logger.info(f"Processing block {block}")
+                    block_df = df[df['block_num'] == block]
+                    if len(block_df) > 4:  # Assuming a table has at least a few cells
+                        logger.info(f"Block {block} has {len(block_df)} cells, might be a table")
+                        # Sort by top-to-bottom (vertical position)
+                        sorted_df = block_df.sort_values(['top', 'left'])
+                        # Convert to pandas table format
+                        table_rows = []
+                        current_row = []
+                        last_top = -100
+                        for _, row in sorted_df.iterrows():
+                            # If we're on a new row (based on vertical position)
+                            if abs(row['top'] - last_top) > 10:  # Threshold for new row
+                                if current_row:
+                                    table_rows.append(current_row)
+                                    current_row = []
+                                last_top = row['top']
+                            current_row.append(row['text'])
+                        # Add the last row
+                        if current_row:
+                            table_rows.append(current_row)
+                        logger.info(f"Extracted {len(table_rows)} rows from potential table")
+                        # If we have multiple rows, we might have a table
+                        if len(table_rows) > 1:
+                            # Try to create a pandas DataFrame
+                            try:
+                                # Pad rows to have equal length
+                                max_cols = max(len(row) for row in table_rows)
+                                logger.info(f"Table has {max_cols} columns")
+                                padded_rows = [row + [''] * (max_cols - len(row)) for row in table_rows]
+                                # Create DataFrame
+                                table_df = pd.DataFrame(padded_rows)
+                                # Add to tables list
+                                tables.append(table_df)
+                                logger.info(f"Successfully created table with shape {table_df.shape}")
+                            except Exception as e:
+                                logger.error(f"Error creating table DataFrame: {str(e)}")
+                                logger.error(traceback.format_exc())
+        except Exception as e:
+            logger.error(f"Error in table detection: {str(e)}")
+            logger.error(traceback.format_exc())
+        logger.info(f"Detected {len(tables)} tables")
+        # Create annotated image
+        try:
+            logger.info("Creating annotated image...")
+            pil_image = Image.fromarray(image)
+            draw = ImageDraw.Draw(pil_image)
+            # Get font for annotation
+            logger.info("Loading font...")
+            try:
+                font_path = get_font()
+                if font_path:
+                    font = ImageFont.truetype(font_path, size=20)
+                    logger.info("Font loaded successfully")
+                else:
+                    logger.warning("Font path is None, using default font")
+                    font = ImageFont.load_default()
+            except Exception as e:
+                logger.error(f"Error loading font: {str(e)}")
+                logger.error(traceback.format_exc())
+                font = ImageFont.load_default()
+                logger.info("Using default font instead")
+            # Draw boxes and text for regular text detection
+            logger.info("Drawing annotation boxes...")
+            for i, (bbox, text, confidence) in enumerate(results):
                 try:
+                    # Convert points to integers
+                    top_left = tuple(map(int, bbox[0]))
+                    bottom_right = tuple(map(int, bbox[2]))
+                    # Draw rectangle
+                    draw.rectangle([top_left, bottom_right], outline="red", width=3)
+                    # Draw text with confidence
+                    text_with_conf = f"{text} ({confidence:.2f})"
+                    draw.text(top_left, text_with_conf, fill="blue", font=font)
+                    logger.info(f"Drew annotation for text region {i+1}")
+                except Exception as e:
+                    logger.error(f"Error drawing annotation for region {i+1}: {str(e)}")
+                    continue
+            # Convert back to numpy array
+            annotated_image = np.array(pil_image)
+            logger.info("Annotated image created successfully")
+        except Exception as e:
+            logger.error(f"Error creating annotated image: {str(e)}")
+            logger.error(traceback.format_exc())
+            annotated_image = image.copy()  # Return original image if annotation fails
+        # Join detected text with proper formatting
+        text_output = "\n".join(detected_text)
+        # Format tables for display
+        tables_output = ""
+        for i, table in enumerate(tables):
+            tables_output += f"Table {i+1}:\n"
+            tables_output += table.to_string(index=False, header=False) + "\n\n"
+        logger.info("OCR extraction completed successfully")
+        return text_output, tables_output, annotated_image
     except Exception as e:
+        error_msg = f"Unexpected error in OCR extraction: {str(e)}"
+        logger.error(error_msg)
+        logger.error(traceback.format_exc())
+        return f"Error: {error_msg}", "Processing failed", None
 # Create Gradio interface
+try:
+    logger.info("Creating Gradio interface...")
+    iface = gr.Interface(
+        fn=ocr_extract_text_and_tables,
+        inputs=gr.Image(type="numpy", label="Upload Image"),
+        outputs=[
+            gr.Textbox(label="Extracted Text (French)", elem_classes=["output-text"]),
+            gr.Textbox(label="Extracted Tables", elem_classes=["output-text"]),
+            gr.Image(label="Annotated Image")
+        ],
+        title="French OCR & Table Extractor",
+        description="Upload an image containing French text and tables for OCR processing. The system will detect and extract both regular text and tabular data.",
+        examples=[],  # You can add example images here
+        cache_examples=True
+    )
+    logger.info("Gradio interface created successfully")
+except Exception as e:
+    logger.error(f"Error creating Gradio interface: {str(e)}")
+    logger.error(traceback.format_exc())
 # Launch the interface
 if __name__ == "__main__":
+    try:
+        logger.info("Launching Gradio interface...")
+        iface.launch()
+        logger.info("Gradio interface launched successfully")
+    except Exception as e:
+        logger.error(f"Error launching Gradio interface: {str(e)}")
+        logger.error(traceback.format_exc())