File size: 4,454 Bytes
490d1f0
 
 
 
 
f547ea0
c9e9a90
 
 
f547ea0
37be9a5
c9e9a90
 
37be9a5
 
58317ea
37be9a5
c9e9a90
58317ea
37be9a5
f547ea0
37be9a5
 
 
 
490d1f0
37be9a5
 
 
 
 
 
 
 
 
 
 
490d1f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37be9a5
490d1f0
37be9a5
490d1f0
 
37be9a5
490d1f0
 
 
 
37be9a5
 
 
490d1f0
 
c9e9a90
1d6b971
905c2c1
1d6b971
58317ea
c9e9a90
37be9a5
 
 
490d1f0
381620a
c9e9a90
 
37be9a5
905c2c1
c9e9a90
 
1d6b971
 
c9e9a90
58317ea
37be9a5
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117





import gradio as gr
from pix2text import Pix2Text
import logging
from PIL import Image

# Set up logging to WARNING level to suppress excessive output from model libraries
logging.basicConfig(level=logging.WARNING)

# Initialize the Pix2Text model once at import time (expensive operation).
# p2t stays None when construction fails so the app can still start;
# recognize_text checks for None and reports the failure to the user instead.
p2t = None
try:
    p2t = Pix2Text()
except Exception:
    # Startup boundary: any failure here (missing weights, broken runtime, ...)
    # must not prevent the UI from launching. logging.exception records the
    # full traceback, which a plain print of the exception message would lose.
    logging.exception(
        "Error initializing Pix2Text model. "
        "Recognition requests will return an error message."
    )

# Define the main recognition function
_NO_TEXT_MESSAGE = "No recognizable text or formulas found in the image."


def _part_to_text(item):
    """Return the text carried by one element of a list result.

    Looks for a ``text`` attribute first, then ``'text'`` / ``'content'`` keys
    when the element is a dict, and otherwise falls back to the element
    itself. Non-string values are converted with ``str`` so the caller can
    join the parts safely. Returns ``None`` when the element carries no value.
    """
    if hasattr(item, 'text'):
        value = item.text
    elif isinstance(item, dict):
        if 'text' in item:
            value = item['text']
        elif 'content' in item:
            value = item['content']
        else:
            value = item
    else:
        value = item

    if value is None:
        return None
    return value if isinstance(value, str) else str(value)


def recognize_text(image_path: str) -> str:
    """Run OCR on the uploaded image and return the recognized text.

    Args:
        image_path: Path of the image file to recognize. ``None`` or an empty
            string (no image submitted) is answered with a prompt to upload
            an image rather than being passed to the model.

    Returns:
        The recognized text. The shape of the model result is not assumed:
        a string is returned as-is, a dict is searched for the keys
        ``text``, ``content``, ``result`` and ``output`` (in that order), and
        the elements of a list are converted to text and joined with blank
        lines. A human-readable message is returned instead when the model
        failed to load, nothing was recognized, or recognition raised.
    """
    # Validate the argument before anything else; without this guard an empty
    # submission reaches the model and surfaces as a raw traceback.
    if not image_path:
        return "No image provided. Please upload an image and try again."

    if p2t is None:
        return (
            "Model initialization failed at startup. Please check the logs "
            "to ensure all dependencies (like ONNX runtime) loaded correctly."
        )

    log = logging.getLogger(__name__)

    try:
        # Recognize text and formulas
        result = p2t.recognize(image_path, save_formula_images=False, use_analyzer=True)

        # Diagnostics about the result structure go to the debug log so they
        # are available when needed without printing every result to stdout.
        log.debug("Result type: %s", type(result))
        log.debug("Result content: %s", result)

        if isinstance(result, str):
            return result if result.strip() else _NO_TEXT_MESSAGE

        if isinstance(result, dict):
            log.debug("Result keys: %s", list(result.keys()))
            for key in ('text', 'content', 'result', 'output'):
                if key in result:
                    text = str(result[key])
                    # Same "nothing found" reply as the str and list branches.
                    return text if text.strip() else _NO_TEXT_MESSAGE
            return f"Result is a dict but couldn't find text. Keys: {list(result.keys())}"

        if isinstance(result, list):
            # Every part is coerced to str (and None dropped) by the helper,
            # so the join below cannot fail on a non-string element.
            parts = [text for text in map(_part_to_text, result) if text is not None]
            extracted_text = "\n\n".join(parts)
            return extracted_text if extracted_text.strip() else _NO_TEXT_MESSAGE

        # If none of the above, try to convert to string
        return str(result) if result else _NO_TEXT_MESSAGE

    except Exception as e:
        # Request boundary: report the failure in the output box instead of
        # letting it propagate, and keep a copy in the server log.
        # NOTE(review): the traceback is shown to the end user, which helps
        # debugging but exposes internals - consider dropping it from the
        # returned text for a public deployment.
        import traceback
        log.exception("Recognition failed for %r", image_path)
        return f"An unexpected error occurred during recognition: {e}\n\nTraceback:\n{traceback.format_exc()}"


# --- Gradio Interface Setup ---

# Input component: type="filepath" hands the callback a local file path
image_input = gr.Image(type="filepath", label="Upload Image (Text/Formula/Math)")

# Output component: a plain multi-line textbox for the recognized text
text_output = gr.Textbox(label="Extracted Text (LaTeX/Plain Text)", lines=10)

# Blurb displayed under the title
app_description = (
    "Upload an image containing text, mathematical formulas, or scientific notation. "
    "The app converts the image content into editable text, using LaTeX for formulas."
)

# Wire the recognition callback to the components defined above
iface = gr.Interface(
    fn=recognize_text,
    inputs=image_input,
    outputs=text_output,
    title="🔬 Pix2Text OCR Formula and Text Recognition",
    description=app_description,
    theme=gr.themes.Soft(),
    allow_flagging="never",
)

# Start the web server only when this file is executed as a script
if __name__ == "__main__":
    iface.launch(show_api=False)