wkplhc committed on
Commit
5ddadc9
·
verified ·
1 Parent(s): a9a8515

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +220 -0
app.py ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import requests
3
+ from bs4 import BeautifulSoup
4
+ import re
5
+ import os
6
+ import tempfile
7
+ import pytesseract
8
+ from PIL import Image, ImageEnhance, ImageFilter
9
+ import cv2
10
+ import numpy as np
11
+ from urllib.parse import urlparse
12
+ import time
13
+ import shutil
14
+
15
# Use CJK-capable fonts so Chinese text renders correctly in matplotlib output.
import matplotlib.pyplot as plt
plt.rcParams["font.family"] = ["SimHei", "WenQuanYi Micro Hei", "Heiti TC"]

# Point pytesseract at the system Tesseract binary (the original author notes
# it is pre-installed on Hugging Face Spaces).  A plain attribute assignment
# can never raise, so instead of a try/except we only override the command
# when the binary really exists; elsewhere (e.g. Windows) pytesseract keeps
# its default lookup of "tesseract" and the path can be set manually.
_TESSERACT_PATH = r'/usr/bin/tesseract'
if os.path.isfile(_TESSERACT_PATH) and os.access(_TESSERACT_PATH, os.X_OK):
    pytesseract.pytesseract.tesseract_cmd = _TESSERACT_PATH
24
+
25
def extract_gif_urls(html_content, base_url=None):
    """Collect the URLs of the numbered page GIFs referenced by an HTML page.

    Only ``<img>`` tags whose ``src`` ends in ``010<digits>.gif`` (matched
    case-insensitively, e.g. ``0101.gif``, ``0102.GIF``) are kept.

    Args:
        html_content: HTML markup to scan.
        base_url: Optional URL of the page the markup came from.  When given,
            relative ``src`` values are resolved against it.  When omitted,
            relative sources are skipped, as before.

    Returns:
        A list of absolute GIF URLs sorted by the number in the file name.
    """
    # Local import: the file's import block only brings in ``urlparse``.
    from urllib.parse import urljoin

    # Both patterns ignore case.  The sort key used to be case-sensitive while
    # the filter was not, so an upper-case ".GIF" passed the filter and then
    # crashed on ``None.group``.
    wanted = re.compile(r'010\d+\.gif$', re.IGNORECASE)
    number = re.compile(r'(\d+)\.gif$', re.IGNORECASE)

    soup = BeautifulSoup(html_content, 'html.parser')

    gif_urls = []
    for img in soup.find_all('img'):
        src = img.get('src', '')
        if not src or not wanted.search(src):
            continue
        if not src.startswith(('http://', 'https://')):
            if not base_url:
                continue  # no page URL to resolve the relative path against
            src = urljoin(base_url, src)
        gif_urls.append(src)

    # Order by file number (0101.gif, 0102.gif, ...).  ``wanted`` guarantees
    # that ``number`` matches every collected URL.
    gif_urls.sort(key=lambda u: int(number.search(u).group(1)))
    return gif_urls
45
+
46
def download_gif(url, save_path):
    """Download ``url`` and write the response body to ``save_path``.

    Args:
        url: Absolute URL of the GIF to fetch.
        save_path: File path the bytes are written to.

    Returns:
        True when the server answered 200 and the file was written; False on
        any other status code or on a network / file-system error.
    """
    try:
        # The context manager releases the connection even on early return.
        with requests.get(url, stream=True, timeout=10) as response:
            if response.status_code != 200:
                return False
            with open(save_path, 'wb') as f:
                # Actually stream the body instead of loading it in one go.
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        return True
    except (requests.RequestException, OSError):
        return False
57
+
58
def process_gif_for_ocr(gif_path):
    """Turn the first frame of a GIF into a high-contrast image for OCR.

    The frame is converted to grayscale, contrast-boosted, sharpened and then
    binarised with a fixed threshold.

    Args:
        gif_path: Path of the GIF file on disk.

    Returns:
        The processed ``PIL.Image.Image`` (mode ``L``), or None when the file
        cannot be opened or decoded.
    """
    threshold = 150

    try:
        # ``convert`` returns an independent in-memory copy, so the file can
        # be closed right after it.
        with Image.open(gif_path) as gif:
            gif.seek(0)  # the text is expected to be in the first frame
            frame = gif.convert('L')
    except (EOFError, OSError):
        # OSError also covers missing files and Pillow's
        # UnidentifiedImageError; the caller reports None as a failure.
        return None

    frame = ImageEnhance.Contrast(frame).enhance(2.0)
    frame = frame.filter(ImageFilter.SHARPEN)

    # Binarise: pixels brighter than the threshold become white, others black.
    return frame.point(lambda p: 255 if p > threshold else 0)
82
+
83
def ocr_image(image):
    """Run Tesseract OCR on a pre-processed image and tidy the result.

    Args:
        image: Image accepted by ``pytesseract.image_to_string``, or None.

    Returns:
        The recognised text with form feeds removed, runs of blank lines
        collapsed to a single newline and surrounding whitespace stripped.
        An empty string when ``image`` is None.
    """
    if image is None:
        return ""

    # LSTM/legacy default engine, "single uniform block" page segmentation,
    # Simplified Chinese plus English language data.
    custom_config = r'--oem 3 --psm 6 -l chi_sim+eng'
    text = pytesseract.image_to_string(image, config=custom_config)

    text = text.replace('\f', '')
    # A single ``replace('\n\n', '\n')`` pass leaves blank lines behind when
    # three or more newlines occur in a row; collapse the whole run instead.
    return re.sub(r'\n{2,}', '\n', text).strip()
95
+
96
def extract_text_from_url(url, progress=gr.Progress()):
    """Fetch a page, download its numbered GIFs and OCR the text in each.

    Args:
        url: Address of the web page that embeds the GIF images.
        progress: Gradio progress tracker used to report status to the UI.

    Returns:
        A ``(text, images)`` tuple on every path, matching the two Gradio
        outputs wired to this handler.  ``text`` is the recognised text (or
        an error message) and ``images`` is the list of processed frames
        (empty on failure).
    """
    try:
        # Downloads live in a temporary directory that is removed on exit.
        with tempfile.TemporaryDirectory() as temp_dir:
            progress(0, desc="正在获取网页内容...")

            response = requests.get(url, timeout=15)
            if response.status_code != 200:
                # Return a tuple here too: the handler has two outputs.
                return f"无法访问网页,状态码:{response.status_code}", []

            progress(0.2, desc="正在提取GIF图片链接...")
            gif_urls = extract_gif_urls(response.text)

            if not gif_urls:
                return "未找到符合条件的GIF图片", []

            total = len(gif_urls)
            progress(0.3, desc=f"找到{total}个GIF图片,开始处理...")

            all_text = []
            gif_images = []
            # The remaining 70% of the bar is split evenly across the GIFs.
            step = 0.7 / total

            for i, gif_url in enumerate(gif_urls):
                progress(0.3 + step * (i + 1), desc=f"处理第{i+1}/{total}个GIF...")

                filename = os.path.basename(urlparse(gif_url).path)
                gif_path = os.path.join(temp_dir, filename)

                if not download_gif(gif_url, gif_path):
                    all_text.append(f"【{filename}】下载失败")
                    continue

                processed_image = process_gif_for_ocr(gif_path)
                if processed_image is None:
                    all_text.append(f"【{filename}】处理失败")
                    continue

                # Keep the in-memory frame for the gallery.  Lazily reopening
                # files from the temporary directory would leave handles to
                # files that are deleted as soon as this function returns.
                gif_images.append(processed_image)

                text = ocr_image(processed_image)
                all_text.append(f"【{filename}】\n{text}")

                # Be polite to the remote server between downloads.
                time.sleep(0.5)

            result_text = "\n\n".join(all_text)

            progress(1.0, desc="处理完成")
            return result_text, gif_images

    except Exception as e:
        # UI boundary: surface any failure as a message instead of crashing.
        return f"处理过程出错:{str(e)}", []
164
+
165
def create_interface():
    """Build the Gradio Blocks UI for the GIF text extraction tool.

    The layout is a URL textbox, an extract button, a text box for the OCR
    result next to a gallery of the processed frames, and a notes section.
    The button is wired to ``extract_text_from_url``.

    Returns:
        The assembled ``gr.Blocks`` app (not yet launched).
    """
    with gr.Blocks(title="霹雳布袋戏GIF文本提取工具") as demo:
        gr.Markdown("""
        # 霹雳布袋戏GIF文本提取工具

        这个工具可以从指定的霹雳布袋戏相关网页中提取GIF图片,并识别其中的文本内容。

        ## 使用方法:
        1. 输入包含GIF的网页URL(例如:https://pilicreateworld.tw-blog.com/PILI/PILI69/01.HTM)
        2. 点击"提取文本"按钮
        3. 等待处理完成,查看识别结果
        """)

        with gr.Row():
            url_input = gr.Textbox(
                label="网页URL",
                placeholder="请输入包含GIF的网页地址",
                value="https://pilicreateworld.tw-blog.com/PILI/PILI69/01.HTM"
            )

        with gr.Row():
            extract_btn = gr.Button("提取文本", variant="primary")

        with gr.Row():
            with gr.Column(scale=1):
                result_text = gr.Textbox(label="识别结果", lines=20)

            with gr.Column(scale=1):
                # Layout options are constructor arguments; the chained
                # ``.style(grid=..., height=...)`` call was removed in
                # Gradio 4 and ``grid`` was replaced by ``columns``.
                processed_images = gr.Gallery(
                    label="处理后的GIF帧",
                    show_label=True,
                    elem_id="gallery",
                    columns=2,
                    height="auto"
                )

        with gr.Row():
            gr.Markdown("""
            ## 注意事项:
            - 识别 accuracy 取决于GIF图片的清晰度
            - 处理可能需要几分钟时间,请耐心等待
            - 如遇网络问题,请检查URL是否正确或稍后重试
            """)

        # Run the extraction when the button is clicked; the handler returns
        # the text and the list of images for the two outputs below.
        extract_btn.click(
            fn=extract_text_from_url,
            inputs=[url_input],
            outputs=[result_text, processed_images]
        )

    return demo
216
+
217
+ # 创建并启动界面
218
# Script entry point: build the UI and start the Gradio server.
if __name__ == "__main__":
    demo = create_interface()
    demo.launch()