File size: 7,636 Bytes
514209d 9005605 514209d 9005605 514209d 9005605 514209d 9005605 514209d 9005605 514209d 9005605 514209d 9005605 514209d 5bce9d0 514209d 5bce9d0 514209d 5bce9d0 514209d 9005605 5bce9d0 514209d 9005605 514209d 5bce9d0 9005605 5bce9d0 d182412 514209d 5bce9d0 d182412 5bce9d0 d182412 8ccf654 d182412 514209d 6552627 8ccf654 514209d 6552627 514209d 8ccf654 5486945 514209d 8ccf654 5486945 8ccf654 514209d 6552627 514209d d182412 514209d 5bce9d0 a1046da 5486945 a1046da 5bce9d0 a1046da 5486945 5bce9d0 a1046da 9005605 5bce9d0 712b04b 5486945 9005605 5486945 9005605 712b04b 5486945 9005605 712b04b 5486945 712b04b 9005605 712b04b 5486945 712b04b 9005605 5486945 9005605 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 |
# import os
# import sys
# import numpy as np
# import cv2
# # ==========================================
# # π§ PATCH 1: Fix Torchvision Compatibility
# # ==========================================
# import torchvision.models.vgg
# if not hasattr(torchvision.models.vgg, 'model_urls'):
# torchvision.models.vgg.model_urls = {
# 'vgg16_bn': 'https://download.pytorch.org/models/vgg16_bn-6c64b313.pth'
# }
# # ==========================================
# # π§ PATCH 2: Fix NumPy Crash AND Coordinates
# # ==========================================
# import craft_text_detector.craft_utils as craft_utils_module
# # CRITICAL FIX: Added 'ratio_net=2' to the signature and calculation.
# # Without this, your boxes are 2x smaller than they should be.
# def fixed_adjustResultCoordinates(polys, ratio_w, ratio_h, ratio_net=2):
# if not polys:
# return []
# adjusted = []
# for poly in polys:
# if poly is None or len(poly) == 0:
# continue
# # Convert to numpy array safely
# poly = np.array(poly).reshape(-1, 2)
# # Scale coordinates:
# # 1. ratio_net scales from Heatmap -> Resized Image
# # 2. ratio_w/h scales from Resized Image -> Original Image
# poly[:, 0] *= (ratio_w * ratio_net)
# poly[:, 1] *= (ratio_h * ratio_net)
# adjusted.append(poly)
# return adjusted
# # Apply the patch
# craft_utils_module.adjustResultCoordinates = fixed_adjustResultCoordinates
# # ==========================================
# import gradio as gr
# from craft_hw_ocr import OCR
# print("β³ Loading OCR models...")
# ocr = OCR.load_models()
# print("β
Models loaded!")
# def do_ocr(inp):
# if inp is None:
# return None, "No image uploaded."
# try:
# # Detection
# # 'detected_img' is usually just the input image in this library
# detected_img, results = OCR.detection(inp, ocr[2])
# # Visualization
# # Now that coordinates are scaled correctly, this should look right
# viz_img = OCR.visualize(inp, results)
# # Recognition
# try:
# # Note: The library has a typo 'recoginition' (extra 'i')
# bboxes, text = OCR.recoginition(detected_img, results, ocr[0], ocr[1])
# except Exception as e:
# print(f"Recognition error: {e}")
# text = f"Detection successful, but recognition failed: {str(e)}"
# return viz_img, text
# except Exception as e:
# print(f"OCR error: {e}")
# return None, f"Error processing image: {str(e)}"
# inputs = gr.Image(label="Upload Image")
# o1 = gr.Image(label="Detections")
# o2 = gr.Textbox(label="Text")
# title = "CRAFT-OCR (Fixed Coords)"
# description = "Handwriting OCR using CRAFT + TrOCR. Patched for NumPy and Coordinates."
# gr.Interface(
# fn=do_ocr,
# inputs=inputs,
# outputs=[o1, o2],
# title=title,
# description=description
# ).launch()
import os
import sys
import numpy as np
import cv2
# ==========================================
# PATCH 1: Torchvision compatibility shim.
# Newer torchvision versions removed the module-level `model_urls` dict that
# craft_text_detector still reads at import time; restore the single entry
# (vgg16_bn weights URL) that the CRAFT backbone loader needs.
# ==========================================
import torchvision.models.vgg
if not hasattr(torchvision.models.vgg, 'model_urls'):
    torchvision.models.vgg.model_urls = {
        'vgg16_bn': 'https://download.pytorch.org/models/vgg16_bn-6c64b313.pth'
    }
# ==========================================
# PATCH 2: Import the craft_utils module so its buggy
# `adjustResultCoordinates` can be replaced (monkey-patched) further below.
# ==========================================
import craft_text_detector.craft_utils as craft_utils_module
def fixed_adjustResultCoordinates(polys, ratio_w, ratio_h, ratio_net=2):
    """Scale detected polygons from heatmap space back to original-image pixels.

    Drop-in replacement for ``craft_text_detector.craft_utils.adjustResultCoordinates``.
    Each polygon is scaled twice: ``ratio_net`` maps heatmap -> resized image,
    and ``ratio_w`` / ``ratio_h`` map resized image -> original image.

    Args:
        polys: Iterable of polygons (lists or ndarrays of (x, y) points);
            may also be None, an empty list, or an ndarray of polygons.
        ratio_w: Horizontal resize ratio (resized -> original).
        ratio_h: Vertical resize ratio (resized -> original).
        ratio_net: Heatmap-to-resized-image scale factor (CRAFT uses 2).

    Returns:
        List of float64 ndarrays of shape (N, 2) in original-image coordinates.
    """
    # `polys` is often a NumPy array; `not polys` raises ValueError on a
    # multi-element ndarray, so test emptiness via len() instead.
    if polys is None or len(polys) == 0:
        return []
    adjusted = []
    for poly in polys:
        if poly is None or len(poly) == 0:
            continue
        # Copy into a float64 array: in-place scaling of an integer-dtype
        # array would raise a casting error (or truncate), and copying
        # avoids mutating the caller's data.
        poly = np.array(poly, dtype=np.float64).reshape(-1, 2)
        # 1. ratio_net scales from heatmap -> resized image
        # 2. ratio_w/h scales from resized image -> original image
        poly[:, 0] *= (ratio_w * ratio_net)
        poly[:, 1] *= (ratio_h * ratio_net)
        adjusted.append(poly)
    return adjusted
# Install the patched coordinate scaler into the detector library so every
# internal caller (including get_prediction below) uses the fixed version.
craft_utils_module.adjustResultCoordinates = fixed_adjustResultCoordinates
# ==========================================
import gradio as gr
from craft_hw_ocr import OCR
# Import the core prediction function directly: the Craft wrapper's
# detect_text() does not expose the threshold parameters tuned by the UI.
from craft_text_detector.predict import get_prediction

print("Loading OCR models...")
ocr = OCR.load_models()
# ocr[2] is the Craft object wrapper
craft_wrapper = ocr[2]
# Extract the actual networks so get_prediction can be called manually.
craft_net = craft_wrapper.craft_net
refine_net = craft_wrapper.refine_net
# Fix: the original print statement's string literal was split across two
# lines by encoding garbling, which is a syntax error.
print("Models loaded!")
def do_ocr(inp, text_threshold, link_threshold, low_text):
    """Detect text regions in *inp* with CRAFT, then transcribe them with TrOCR.

    Args:
        inp: Input image from the Gradio Image component (or None).
        text_threshold: Per-pixel text confidence threshold.
        link_threshold: Character-linking confidence threshold.
        low_text: Low-bound text score used to filter background noise.

    Returns:
        (visualization image, recognized text) for the Gradio outputs; on
        failure the image slot is None and the text explains the error.
    """
    if inp is None:
        return None, "No image uploaded."
    try:
        print(f"βοΈ Running Direct Inference: Text={text_threshold}, Link={link_threshold}, Low={low_text}")
        # Call the CRAFT engine directly (not Craft.detect_text) so the
        # slider-controlled thresholds are actually honoured.
        prediction = get_prediction(
            image=inp,
            craft_net=craft_net,
            refine_net=refine_net,
            text_threshold=text_threshold,
            link_threshold=link_threshold,
            low_text=low_text,
            cuda=False,  # Space is CPU
            poly=True,
        )
        # Draw the detected regions over the input image.
        overlay = OCR.visualize(inp, prediction)
        # Detection output is still useful when recognition fails, so a
        # recognition error is reported as the text result instead of raised.
        try:
            # NOTE: the library API really is spelled 'recoginition'.
            _, recognized = OCR.recoginition(inp, prediction, ocr[0], ocr[1])
        except Exception as e:
            recognized = f"Detection successful, but recognition failed: {e}"
        return overlay, recognized
    except Exception as e:
        print(f"OCR error: {e}")
        return None, f"Error processing image: {str(e)}"
# ------------------------------------------------------
# UI with tuning sliders: the three thresholds are forwarded
# verbatim to get_prediction() via do_ocr().
# ------------------------------------------------------
with gr.Blocks(title="CRAFT-OCR Tuner") as app:
    gr.Markdown("## π§ CRAFT-OCR Parameter Tuner")
    gr.Markdown("Adjust sliders to fix issues like merged words or background noise.")
    with gr.Row():
        with gr.Column(scale=1):
            image_in = gr.Image(label="Upload Image")
            gr.Markdown("### ποΈ Fine-Tune Detection")
            slider_text = gr.Slider(
                minimum=0.1, maximum=0.9, value=0.7, step=0.05,
                label="Text Threshold",
                info="Confidence to consider a pixel as text. Higher = Less Noise.",
            )
            slider_link = gr.Slider(
                minimum=0.1, maximum=0.9, value=0.4, step=0.05,
                label="Link Threshold",
                info="Confidence to link characters. HIGHER value splits merged words (Fixes 'Hamburgthen').",
            )
            slider_low = gr.Slider(
                minimum=0.1, maximum=0.9, value=0.4, step=0.05,
                label="Low Text Threshold",
                info="Filters background noise. Higher = Cleaner background.",
            )
            run_btn = gr.Button("Run OCR", variant="primary")
        with gr.Column(scale=1):
            det_image = gr.Image(label="Detections (Verify Boxes)")
            ocr_text = gr.Textbox(label="Recognized Text", lines=10)
    run_btn.click(
        fn=do_ocr,
        inputs=[image_in, slider_text, slider_link, slider_low],
        outputs=[det_image, ocr_text],
    )

if __name__ == "__main__":
    app.launch()