# pixelplanet / app.py
# m2zm's picture
# Update app.py
# f64a374 verified
import base64
import uuid
import cairosvg
import cv2
import numpy as np
import re
import torch
from PIL import Image, ImageEnhance, ImageFilter
from transformers import VisionEncoderDecoderModel, TrOCRProcessor
import gradio as gr
# Checkpoint used for both the image processor and the encoder-decoder model.
MODEL_ID = "anuashok/ocr-captcha-v3"

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Half precision is only requested on CUDA. On CPU, float16 is generally
# slower and, depending on the PyTorch version, some operators may lack
# half-precision kernels, so full precision is used there instead.
MODEL_DTYPE = torch.float16 if device == "cuda" else torch.float32

processor = TrOCRProcessor.from_pretrained(MODEL_ID)
model = VisionEncoderDecoderModel.from_pretrained(
    MODEL_ID,
    torch_dtype=MODEL_DTYPE,
).to(device)

# Inference only: switch off training-time behaviour such as dropout.
model.eval()
def advanced_preprocess(cv_image):
    """Turn a colour captcha frame into a high-contrast RGB PIL image for OCR.

    Steps, in order: grayscale conversion (the input is treated as BGR),
    bilateral filtering, inverted Otsu binarisation, conversion to an RGB
    PIL image, sharpening, and a 2.5x contrast boost.

    Args:
        cv_image: Image array accepted by ``cv2.cvtColor`` with the
            ``COLOR_BGR2GRAY`` conversion.

    Returns:
        The processed ``PIL.Image`` in RGB mode.
    """
    grayscale = cv2.cvtColor(cv_image, cv2.COLOR_BGR2GRAY)

    # Bilateral filter: diameter 5, sigmaColor 75, sigmaSpace 75.
    smoothed = cv2.bilateralFilter(grayscale, 5, 75, 75)

    # With THRESH_OTSU the threshold argument (0) is ignored and chosen
    # automatically; only the binarised image (second return value) is used.
    binarised = cv2.threshold(
        smoothed, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
    )[1]

    rgb_image = Image.fromarray(binarised).convert("RGB")
    sharpened = rgb_image.filter(ImageFilter.SHARPEN)
    return ImageEnhance.Contrast(sharpened).enhance(2.5)
def run_ocr(pil_image):
    """Read the text of a preprocessed captcha image with the OCR model.

    The image is turned into model inputs by the module-level ``processor``,
    decoded with beam search by the module-level ``model``, and the decoded
    string is upper-cased and stripped of every non-alphanumeric character.

    Args:
        pil_image: Image handed to the processor as ``images=``.

    Returns:
        The cleaned upper-case string. It can be empty or shorter than four
        characters, because cleaning happens after decoding.
    """
    encoded = processor(images=pil_image, return_tensors="pt")
    pixel_values = encoded.pixel_values.to(device)

    # Generation settings: output length pinned to 4 (min == max), 3 beams,
    # plus repetition controls.
    generation_options = {
        "max_length": 4,
        "min_length": 4,
        "num_beams": 3,
        "no_repeat_ngram_size": 2,
        "early_stopping": True,
        "length_penalty": 1.0,
        "repetition_penalty": 1.5,
    }

    # No gradients are needed for inference.
    with torch.no_grad():
        generated_ids = model.generate(pixel_values, **generation_options)

    decoded = processor.batch_decode(generated_ids, skip_special_tokens=True)
    candidate = decoded[0].strip().upper()
    return re.sub(r'[^A-Za-z0-9]', '', candidate)
def genRotations(svg):
    """Search eight rotation angles for one that OCRs to a 4-character code.

    The SVG is expected to contain ``rotate(1, x, 150)`` and
    ``rotate(-1, x, 150)`` transforms. The x value of the first and of the
    last such transform are used as two pivots. For every candidate angle
    the SVG is rendered twice: once with the first pivot's transforms set to
    the angle, once with the last pivot's. The column band around the first
    pivot is copied from the first render into the second, and the combined
    image is preprocessed and passed to OCR.

    Args:
        svg: SVG markup as a string.

    Returns:
        The first OCR result that is exactly four alphanumeric characters,
        or ``""`` when fewer than two rotate transforms are found or no
        angle produces such a result.
    """
    # Drop the endless rotation animation so each render is a static frame.
    disable_anim = re.sub(r'<animateTransform type="rotate" repeatCount="indefinite" attributeName="transform" from="\d+ \d+,\d+" to="\d+ \d+ \d+" begin="\d+" dur="\d+s"/>', '', svg)
    matches = re.findall(r"rotate\((1|-1), (\d+), (\d+)\)", disable_anim)
    if len(matches) < 2:
        return ""

    # Each match is (sign, x, y); only the x of the first/last one is used.
    firstcoords = matches[0][1]
    secondcoords = matches[-1][1]
    first_x = int(firstcoords)

    def create_rotated_image(svg_code, angle_pos, coords):
        """Render ``svg_code`` with the ±1 rotations about ``coords`` set to ±``angle_pos``."""
        # The targets are literal strings (coords is digits only), so plain
        # replacement is equivalent to a regex substitution here.
        rotated = svg_code.replace(
            f'rotate(1, {coords}, 150)', f'rotate({angle_pos}, {coords}, 150)'
        )
        rotated = rotated.replace(
            f'rotate(-1, {coords}, 150)', f'rotate(-{angle_pos}, {coords}, 150)'
        )
        image_data = cairosvg.svg2png(bytestring=rotated.encode('utf-8'))
        nparr = np.frombuffer(image_data, np.uint8)
        return cv2.imdecode(nparr, cv2.IMREAD_COLOR)

    def combine_images(base_img, part_img, width=68):
        """Return a copy of ``base_img`` with the band around the first pivot taken from ``part_img``."""
        combined_img = base_img.copy()
        # Clamp the left edge: a negative start would wrap around as a NumPy
        # negative index and silently select an empty (or wrong) band.
        # NOTE(review): assumes SVG x coordinates map 1:1 to pixel columns.
        left = max(first_x - width, 0)
        right = first_x + width  # slicing past the right edge is safe
        combined_img[:, left:right] = part_img[:, left:right]
        return combined_img

    # Candidate angles: 0, 45, ..., 315 degrees.
    for angle in range(0, 360, 45):
        img1 = create_rotated_image(disable_anim, angle, firstcoords)
        img2 = create_rotated_image(disable_anim, angle, secondcoords)
        # cv2.imdecode signals failure by returning None; skip that angle
        # instead of crashing later inside the preprocessing step.
        if img1 is None or img2 is None:
            continue
        combo = combine_images(img2, img1)
        res = run_ocr(advanced_preprocess(combo))
        if len(res) == 4 and res.isalnum():
            return res
    return ""
def predict(svg_text):
    """Gradio handler: solve the captcha contained in an SVG string.

    Accepts either raw SVG markup or a ``data:image/svg+xml;base64,`` data
    URI. Every request is logged with a fresh UUID, and the value that is
    returned is the value that is logged.

    Args:
        svg_text: Raw SVG markup or a base64 SVG data URI. ``None`` is
            treated the same as an empty string.

    Returns:
        ``"Empty input"`` for blank input, ``"Invalid input"`` when the data
        URI payload is not valid base64 or not UTF-8, ``"Too large"`` when
        the SVG exceeds 30000 characters, the solved text on success, and
        ``"XXXX"`` when no solution was found.
    """
    request_id = str(uuid.uuid4())
    print(f"Yeni istek geldi. ID: {request_id}")

    def respond(answer):
        """Log the outgoing answer for this request and hand it back."""
        print(f"OCR cevabı döndürüldü. ID: {request_id}, Cevap: {answer}")
        return answer

    # Guard against None so a missing value does not raise AttributeError.
    text = (svg_text or "").strip()
    if not text:
        return respond("Empty input")

    if text.startswith('data:image/svg+xml;base64,'):
        try:
            svg = base64.b64decode(text.split(',')[-1]).decode('utf-8')
        except ValueError:
            # binascii.Error (bad base64) and UnicodeDecodeError (payload is
            # not UTF-8) are both ValueError subclasses.
            return respond("Invalid input")
    else:
        svg = text

    if len(svg) > 30000:
        return respond("Too large")

    result = genRotations(svg)
    # "XXXX" is the placeholder answer when no rotation produced a solution.
    return respond(result if result else "XXXX")
# Gradio UI: one multi-line text box for the SVG input and one text box for
# the solved captcha, both wired to predict().
demo = gr.Interface(
    fn=predict,
    inputs=gr.Textbox(label="SVG", lines=6, placeholder="SVG to PNG..."),
    outputs=gr.Textbox(label="Solution"),
    title="Captcha Solver",
)

# Start the web server only when this file is executed as a script, so that
# importing the module (e.g. to reuse `demo` or `predict`) has no side effect.
if __name__ == "__main__":
    demo.launch()