Spaces:

wkplhc
/

ocr

Sleeping

App Files Files Community

wkplhc committed on Sep 15, 2025

Commit

5ddadc9

verified ·

1 Parent(s): a9a8515

Create app.py

Browse files

Files changed (1) hide show

app.py +220 -0

app.py ADDED Viewed

	@@ -0,0 +1,220 @@

+import gradio as gr
+import requests
+from bs4 import BeautifulSoup
+import re
+import os
+import tempfile
+import pytesseract
+from PIL import Image, ImageEnhance, ImageFilter
+import cv2
+import numpy as np
+from urllib.parse import urlparse
+import time
+import shutil
+# 确保中文显示正常
+import matplotlib.pyplot as plt
+plt.rcParams["font.family"] = ["SimHei", "WenQuanYi Micro Hei", "Heiti TC"]
+# 设置Tesseract OCR路径（Hugging Face Spaces上已预安装）
+try:
+    pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'
+except:
+    pass  # 在Windows上可能需要手动设置
+def extract_gif_urls(html_content):
+    """从HTML内容中提取符合条件的GIF图片URL"""
+    soup = BeautifulSoup(html_content, 'html.parser')
+    img_tags = soup.find_all('img')
+    gif_urls = []
+    # 匹配霹雳布袋戏相关的GIF格式，特别是0101.gif这类序列
+    pattern = r'010\d+\.gif$'
+    for img in img_tags:
+        src = img.get('src', '')
+        if src and re.search(pattern, src, re.IGNORECASE):
+            # 处理相对路径
+            if not src.startswith(('http://', 'https://')):
+                continue  # 简单处理，实际可能需要更复杂的URL拼接
+            gif_urls.append(src)
+    # 按文件名排序（0101.gif, 0102.gif...）
+    gif_urls.sort(key=lambda x: int(re.search(r'(\d+)\.gif', x).group(1)))
+    return gif_urls
+def download_gif(url, save_path):
+    """下载GIF图片"""
+    try:
+        response = requests.get(url, stream=True, timeout=10)
+        if response.status_code == 200:
+            with open(save_path, 'wb') as f:
+                f.write(response.content)
+            return True
+        return False
+    except:
+        return False
+def process_gif_for_ocr(gif_path):
+    """处理GIF图片以提高OCR识别率"""
+    # 打开GIF
+    gif = Image.open(gif_path)
+    # 提取第一帧（通常文本在第一帧）
+    try:
+        gif.seek(0)
+        frame = gif.convert('L')  # 转为灰度图
+        # 增强对比度
+        enhancer = ImageEnhance.Contrast(frame)
+        frame = enhancer.enhance(2.0)
+        # 轻微锐化
+        frame = frame.filter(ImageFilter.SHARPEN)
+        # 二值化处理
+        threshold = 150
+        frame = frame.point(lambda p: p > threshold and 255)
+        return frame
+    except EOFError:
+        return None
+def ocr_image(image):
+    """对处理后的图像进行OCR识别"""
+    if image is None:
+        return ""
+    # 使用Tesseract进行OCR，指定中文识别
+    custom_config = r'--oem 3 --psm 6 -l chi_sim+eng'
+    text = pytesseract.image_to_string(image, config=custom_config)
+    # 清理识别结果
+    text = text.replace('\f', '').replace('\n\n', '\n').strip()
+    return text
+def extract_text_from_url(url, progress=gr.Progress()):
+    """从指定URL提取GIF并识别文本"""
+    try:
+        # 创建临时目录
+        with tempfile.TemporaryDirectory() as temp_dir:
+            progress(0, desc="正在获取网页内容...")
+            # 获取网页内容
+            response = requests.get(url, timeout=15)
+            if response.status_code != 200:
+                return f"无法访问网页，状态码：{response.status_code}"
+            # 提取GIF URL
+            progress(0.2, desc="正在提取GIF图片链接...")
+            gif_urls = extract_gif_urls(response.text)
+            if not gif_urls:
+                return "未找到符合条件的GIF图片"
+            progress(0.3, desc=f"找到{len(gif_urls)}个GIF图片，开始处理...")
+            # 下载并处理每个GIF
+            all_text = []
+            gif_images = []
+            step = 0.7 / len(gif_urls)
+            current_progress = 0.3
+            for i, gif_url in enumerate(gif_urls):
+                # 更新进度
+                current_progress += step
+                progress(current_progress, desc=f"处理第{i+1}/{len(gif_urls)}个GIF...")
+                # 提取文件名
+                parsed_url = urlparse(gif_url)
+                filename = os.path.basename(parsed_url.path)
+                # 下载GIF
+                gif_path = os.path.join(temp_dir, filename)
+                if not download_gif(gif_url, gif_path):
+                    all_text.append(f"【{filename}】下载失败")
+                    continue
+                # 处理GIF以提高OCR识别率
+                processed_image = process_gif_for_ocr(gif_path)
+                if processed_image is None:
+                    all_text.append(f"【{filename}】处理失败")
+                    continue
+                # 保存处理后的图像用于展示
+                processed_path = os.path.join(temp_dir, f"processed_{filename}.png")
+                processed_image.save(processed_path)
+                gif_images.append(processed_path)
+                # 识别文本
+                text = ocr_image(processed_image)
+                all_text.append(f"【{filename}】\n{text}")
+                # 避免请求过于频繁
+                time.sleep(0.5)
+            # 拼接所有文本
+            result_text = "\n\n".join(all_text)
+            progress(1.0, desc="处理完成")
+            return result_text, [Image.open(img_path) for img_path in gif_images]
+    except Exception as e:
+        return f"处理过程出错：{str(e)}", []
+def create_interface():
+    """创建Gradio界面"""
+    with gr.Blocks(title="霹雳布袋戏GIF文本提取工具") as demo:
+        gr.Markdown("""
+        # 霹雳布袋戏GIF文本提取工具
+        这个工具可以从指定的霹雳布袋戏相关网页中提取GIF图片，并识别其中的文本内容。
+        ## 使用方法：
+        1. 输入包含GIF的网页URL（例如：https://pilicreateworld.tw-blog.com/PILI/PILI69/01.HTM）
+        2. 点击"提取文本"按钮
+        3. 等待处理完成，查看识别结果
+        """)
+        with gr.Row():
+            url_input = gr.Textbox(
+                label="网页URL",
+                placeholder="请输入包含GIF的网页地址",
+                value="https://pilicreateworld.tw-blog.com/PILI/PILI69/01.HTM"
+            )
+        with gr.Row():
+            extract_btn = gr.Button("提取文本", variant="primary")
+        with gr.Row():
+            with gr.Column(scale=1):
+                result_text = gr.Textbox(label="识别结果", lines=20)
+            with gr.Column(scale=1):
+                processed_images = gr.Gallery(
+                    label="处理后的GIF帧",
+                    show_label=True,
+                    elem_id="gallery"
+                ).style(grid=[2], height="auto")
+        with gr.Row():
+            gr.Markdown("""
+            ## 注意事项：
+            - 识别 accuracy 取决于GIF图片的清晰度
+            - 处理可能需要几分钟时间，请耐心等待
+            - 如遇网络问题，请检查URL是否正确或稍后重试
+            """)
+        # 设置事件
+        extract_btn.click(
+            fn=extract_text_from_url,
+            inputs=[url_input],
+            outputs=[result_text, processed_images]
+        )
+    return demo
+# Build the interface and launch it when the file is run as a script.
+if __name__ == "__main__":
+    demo = create_interface()
+    demo.launch()