"""Gradio web app that runs Pix2Text OCR on an uploaded image.

The Pix2Text model is created once at import time. ``recognize_text`` is the
callback wired into the Gradio interface; it always returns a string, either
the extracted text or a human-readable error message.
"""

import logging
import traceback
from typing import Any, Optional

import gradio as gr
from PIL import Image
from pix2text import Pix2Text

# Root logger stays at WARNING to suppress excessive output from model libraries.
logging.basicConfig(level=logging.WARNING)

# This module's own diagnostics remain visible (they were unconditional
# ``print`` calls before); raise this level to silence them.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

_NO_TEXT_MESSAGE = "No recognizable text or formulas found in the image."

# Keys probed, in order, when the recognizer returns a single dict.
_DICT_RESULT_KEYS = ("text", "content", "result", "output")

# Keys probed, in order, when an element of a list result is a dict.
_DICT_ITEM_KEYS = ("text", "content")

# Initialize Pix2Text model globally (expensive operation, do it once).
# ``p2t`` stays ``None`` if construction fails so the app can still start and
# report the problem to the user instead of crashing at import time.
p2t = None
try:
    p2t = Pix2Text()
except Exception as e:  # top-level boundary: log and keep the UI alive
    logger.exception(
        "Error initializing Pix2Text model: %s. Recognition will use a fallback function.",
        e,
    )


def _as_text(value: Any) -> str:
    """Coerce *value* to ``str``, mapping ``None`` to the empty string.

    Needed because ``str.join`` raises ``TypeError`` on non-string parts.
    """
    return "" if value is None else str(value)


def _item_to_text(item: Any) -> str:
    """Return the text carried by one element of a list-shaped result.

    Checks, in order: an object exposing a ``text`` attribute, a plain
    string, a dict with a ``'text'`` or ``'content'`` key. Anything else
    (including a dict with neither key) falls back to ``str(item)``.
    """
    if hasattr(item, "text"):
        return _as_text(item.text)
    if isinstance(item, str):
        return item
    if isinstance(item, dict):
        for key in _DICT_ITEM_KEYS:
            if key in item:
                return _as_text(item[key])
    return str(item)


def _extract_text(result: Any) -> str:
    """Flatten a recognizer result (str, dict, list or other) into a string.

    Returns the "no text found" message when the result is empty or blank.
    """
    if isinstance(result, str):
        return result if result.strip() else _NO_TEXT_MESSAGE

    if isinstance(result, dict):
        logger.debug("Result keys: %s", result.keys())
        for key in _DICT_RESULT_KEYS:
            if key in result:
                return str(result[key])
        return f"Result is a dict but couldn't find text. Keys: {list(result.keys())}"

    if isinstance(result, list):
        extracted_parts = []
        for i, item in enumerate(result):
            logger.debug("Item %d type: %s", i, type(item))
            logger.debug("Item %d content: %s", i, item)
            extracted_parts.append(_item_to_text(item))
        extracted_text = "\n\n".join(extracted_parts)
        return extracted_text if extracted_text.strip() else _NO_TEXT_MESSAGE

    # Unknown shape: fall back to its string form if it is truthy.
    return str(result) if result else _NO_TEXT_MESSAGE


def recognize_text(image_path: Optional[str]) -> str:
    """Perform OCR on the uploaded image and return the extracted text.

    Args:
        image_path: Local path of the uploaded image, as supplied by the
            ``gr.Image(type="filepath")`` input. May be ``None``/empty when
            nothing was uploaded (assumed Gradio behaviour for an empty
            image input).

    Returns:
        The recognized text, or a human-readable message when the model is
        unavailable, no image was provided, nothing was recognized, or an
        error occurred. This function never raises.
    """
    if p2t is None:
        return (
            "Model initialization failed at startup. Please check the logs "
            "to ensure all dependencies (like ONNX runtime) loaded correctly."
        )

    # Without this guard a missing upload reaches the recognizer and comes
    # back to the user as a traceback.
    if not image_path:
        return "No image provided. Please upload an image and try again."

    try:
        # Recognize text and formulas.
        result = p2t.recognize(image_path, save_formula_images=False, use_analyzer=True)
        logger.debug("Result type: %s", type(result))
        logger.debug("Result content: %s", result)
        # Kept inside the ``try`` so that an unexpected result shape is
        # reported the same way as a recognizer failure.
        return _extract_text(result)
    except Exception as e:  # UI boundary: report instead of crashing the app
        logger.exception("Recognition failed for %s", image_path)
        return (
            f"An unexpected error occurred during recognition: {e}"
            f"\n\nTraceback:\n{traceback.format_exc()}"
        )


# --- Gradio Interface Setup ---
iface = gr.Interface(
    fn=recognize_text,
    # Use type="filepath" to send the local file path to the Python function.
    inputs=gr.Image(type="filepath", label="Upload Image (Text/Formula/Math)"),
    # The output is a standard textbox.
    outputs=gr.Textbox(label="Extracted Text (LaTeX/Plain Text)", lines=10),
    title="🔬 Pix2Text OCR Formula and Text Recognition",
    description=(
        "Upload an image containing text, mathematical formulas, or scientific notation. "
        "The app converts the image content into editable text, using LaTeX for formulas."
    ),
    theme=gr.themes.Soft(),
    allow_flagging="never",
)

# Launch the Gradio app.
if __name__ == "__main__":
    iface.launch(show_api=False)