tregu0458 committed on
Commit
c6a2c26
·
verified ·
1 Parent(s): 0a3c5b4

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +310 -0
app.py ADDED
@@ -0,0 +1,310 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import importlib
import importlib.util
import os
import sys
import tempfile
import traceback
from typing import List, Tuple, Optional, Dict
from urllib.parse import urlparse

# Base required imports
import gradio as gr
11
+
12
# Dictionary of required packages for each file type.
# Keys are the logical source types handled by this app; values are the
# importable module names that must be present to process that type.
REQUIRED_PACKAGES = {
    'url': ['langchain_community', 'requests', 'bs4'],
    'pdf': ['langchain_community', 'pypdf'],
    'docx': ['langchain_community', 'unstructured']
}

def check_and_import_packages(file_type: str) -> Tuple[bool, str, Optional[Exception]]:
    """Verify that the packages needed to process *file_type* are importable.

    Args:
        file_type: One of the REQUIRED_PACKAGES keys ('url', 'pdf', 'docx').
            Types with no entry are treated as having no requirements.

    Returns:
        (success, error_message, exception) — on success the message is ""
        and the exception is None.
    """
    # BUG FIX: `import importlib` alone does not make `importlib.util`
    # available (since Python 3.10 the submodule is no longer implicitly
    # imported), so the original code raised AttributeError here.
    import importlib.util

    if file_type not in REQUIRED_PACKAGES:
        return True, "", None

    # find_spec returns None when the module cannot be located.
    missing_packages = [
        package
        for package in REQUIRED_PACKAGES[file_type]
        if importlib.util.find_spec(package) is None
    ]

    if missing_packages:
        error_msg = (f"ERROR: Missing required packages for {file_type} processing:\n"
                     f" - Missing: {', '.join(missing_packages)}\n"
                     f" - Install with: pip install {' '.join(missing_packages)}")
        return False, error_msg, None

    try:
        # Import the concrete loader class up front so callers fail fast with
        # a readable message instead of an ImportError mid-processing.
        if file_type == 'url':
            from langchain_community.document_loaders import WebBaseLoader
        elif file_type == 'pdf':
            from langchain_community.document_loaders import PyPDFLoader
        elif file_type == 'docx':
            from langchain_community.document_loaders import UnstructuredWordDocumentLoader
        return True, "", None
    except Exception as e:
        return False, f"ERROR: Failed to import required modules for {file_type}:\n {str(e)}", e
48
+
49
def count_characters(text: str) -> Dict[str, int]:
    """Return character statistics for *text*.

    Keys: 'total' (all characters), 'excluding_spaces' (spaces, newlines and
    tabs removed), 'japanese' (kanji, hiragana and katakana code points).
    """
    if not text:
        return {'total': 0, 'excluding_spaces': 0, 'japanese': 0}

    compact = text.replace(' ', '').replace('\n', '').replace('\t', '')
    # CJK unified ideographs, hiragana, katakana.
    jp_ranges = (('\u4e00', '\u9fff'), ('\u3040', '\u309f'), ('\u30a0', '\u30ff'))
    japanese = sum(1 for ch in text if any(lo <= ch <= hi for lo, hi in jp_ranges))

    return {
        'total': len(text),
        'excluding_spaces': len(compact),
        'japanese': japanese,
    }
62
+
63
def format_char_count(counts: Dict[str, int]) -> str:
    """Render the statistics from count_characters as display text (Japanese labels)."""
    lines = [
        f"文字数(スペース・改行含む): {counts['total']}",
        f"文字数(スペース・改行除く): {counts['excluding_spaces']}",
        f"日本語文字数: {counts['japanese']}",
    ]
    return "\n".join(lines)
68
+
69
def process_raw_text(text: str) -> Tuple[str, List[str]]:
    """Wrap directly-entered text in a section header.

    Args:
        text: Raw user input; None/empty/whitespace-only yields no content.

    Returns:
        (formatted_text, errors) — errors is always empty here.
    """
    errors: List[str] = []
    if not text or not text.strip():
        return "", errors

    # The original wrapped this in try/except, but formatting a str with an
    # f-string cannot raise — the handler was unreachable and is removed.
    return f"\n=== Raw Text Input ===\n{text.strip()}\n", errors
80
+
81
def is_valid_url(url: str) -> bool:
    """Return True if *url* has both a scheme and a network location.

    Replaces the original bare ``except:`` (which also swallowed
    KeyboardInterrupt/SystemExit) with the exceptions urlparse can actually
    raise: ValueError for malformed input (e.g. an unclosed IPv6 bracket),
    plus TypeError/AttributeError for non-string arguments.
    """
    try:
        parts = urlparse(url)
    except (ValueError, TypeError, AttributeError):
        return False
    return bool(parts.scheme and parts.netloc)
88
+
89
def process_urls(urls: str) -> Tuple[str, List[str]]:
    """Fetch and concatenate page text for each newline-separated URL.

    Returns (combined_text, errors); a failing URL is reported in *errors*
    and skipped rather than aborting the whole batch.
    """
    errors: List[str] = []
    if not urls.strip():
        return "", errors

    ok, message, _ = check_and_import_packages('url')
    if not ok:
        return "", [message]

    from langchain_community.document_loaders import WebBaseLoader

    pieces: List[str] = []
    for line in urls.split('\n'):
        url = line.strip()
        if not url:
            continue
        if not is_valid_url(url):
            errors.append(f"ERROR: Invalid URL format: {url}")
            continue
        try:
            documents = WebBaseLoader(url).load()
        except Exception as e:
            errors.append(f"ERROR: Failed to process URL {url}:\n {str(e)}\n {traceback.format_exc()}")
            continue
        pieces.append(f"\n=== Content from URL: {url} ===\n")
        for doc in documents:
            pieces.append(doc.page_content + "\n")

    return "".join(pieces), errors
120
+
121
def process_txt(txt_file: tempfile._TemporaryFileWrapper) -> Tuple[str, List[str]]:
    """Read a plain-text upload, trying UTF-8 first and Shift-JIS as fallback.

    Returns (formatted_text, errors); on failure the text is "" and the
    error list holds a single descriptive message.
    """
    errors: List[str] = []
    name = os.path.basename(txt_file.name)
    try:
        with open(txt_file.name, 'r', encoding='utf-8') as fh:
            body = fh.read()
    except UnicodeDecodeError:
        # Not valid UTF-8 — retry with Shift-JIS, common for Japanese files.
        try:
            with open(txt_file.name, 'r', encoding='shift-jis') as fh:
                body = fh.read()
        except Exception as exc:
            errors.append(f"ERROR: Failed to process text file {name} (encoding error):\n {str(exc)}\n {traceback.format_exc()}")
            return "", errors
    except Exception as exc:
        errors.append(f"ERROR: Failed to process text file {name}:\n {str(exc)}\n {traceback.format_exc()}")
        return "", errors

    return f"\n=== Content from TXT: {name} ===\n{body}\n", errors
140
+
141
def process_pdf(pdf_file: tempfile._TemporaryFileWrapper) -> Tuple[str, List[str]]:
    """Extract page text from a PDF upload via PyPDFLoader.

    Returns (formatted_text, errors); dependency or parse failures are
    reported through the error list with the text left empty.
    """
    ok, message, _ = check_and_import_packages('pdf')
    if not ok:
        return "", [message]

    from langchain_community.document_loaders import PyPDFLoader

    label = os.path.basename(pdf_file.name)
    try:
        documents = PyPDFLoader(pdf_file.name).load()
    except Exception as exc:
        return "", [f"ERROR: Failed to process PDF {label}:\n {str(exc)}\n {traceback.format_exc()}"]

    body = "".join(doc.page_content + "\n" for doc in documents)
    return f"\n=== Content from PDF: {label} ===\n{body}", []
163
+
164
def process_docx(docx_file: tempfile._TemporaryFileWrapper) -> Tuple[str, List[str]]:
    """Extract text from a Word document via UnstructuredWordDocumentLoader.

    Returns (formatted_text, errors); dependency or parse failures are
    reported through the error list with the text left empty.
    """
    ok, message, _ = check_and_import_packages('docx')
    if not ok:
        return "", [message]

    from langchain_community.document_loaders import UnstructuredWordDocumentLoader

    label = os.path.basename(docx_file.name)
    try:
        documents = UnstructuredWordDocumentLoader(docx_file.name).load()
    except Exception as exc:
        return "", [f"ERROR: Failed to process DOCX {label}:\n {str(exc)}\n {traceback.format_exc()}"]

    body = "".join(doc.page_content + "\n" for doc in documents)
    return f"\n=== Content from DOCX: {label} ===\n{body}", []
186
+
187
+ def process_file(file: tempfile._TemporaryFileWrapper) -> Tuple[str, List[str]]:
188
+ """Process a file based on its extension."""
189
+ errors = []
190
+ if not file:
191
+ return "", errors
192
+
193
+ file_ext = os.path.splitext(file.name)[1].lower()
194
+
195
+ # Process based on file extension
196
+ if file_ext == '.txt':
197
+ return process_txt(file)
198
+ elif file_ext == '.pdf':
199
+ return process_pdf(file)
200
+ elif file_ext in ['.doc', '.docx']:
201
+ return process_docx(file)
202
+ else:
203
+ return "", [f"ERROR: Unsupported file type: {file_ext}"]
204
+
205
def combine_content(raw_text: str, url_input: str, files: List[tempfile._TemporaryFileWrapper]) -> Tuple[str, str, str, str]:
    """Combine content from all sources into a single text file.

    Args:
        raw_text: Direct text input (may be empty).
        url_input: Newline-separated URLs (may be empty).
        files: Uploaded file handles (may be None/empty).

    Returns:
        (combined_text, download_path, char_count_text, error_text) matching
        the four Gradio output components.
    """
    combined_text = ""
    all_errors: List[str] = []

    # Process raw text if provided
    if raw_text:
        text_content, text_errors = process_raw_text(raw_text)
        combined_text += text_content
        all_errors.extend(text_errors)

    # Process URLs if provided
    if url_input:
        url_text, url_errors = process_urls(url_input)
        combined_text += url_text
        all_errors.extend(url_errors)

    # Process each uploaded file
    if files:
        for file in files:
            file_text, file_errors = process_file(file)
            combined_text += file_text
            all_errors.extend(file_errors)

    # Character statistics are computed on the real content, before any
    # placeholder message is substituted below.
    char_counts = count_characters(combined_text)
    char_count_text = format_char_count(char_counts)

    if not combined_text.strip():
        if all_errors:
            combined_text = "No content was extracted due to errors. Please check the error messages below."
        else:
            combined_text = "No content was extracted. Please provide some input (text, URLs, or files)."

    # "処理は正常に完了しました。" = "processing completed normally".
    error_text = "\n".join(all_errors) if all_errors else "処理は正常に完了しました。"

    # Save to a temporary file for the download widget.
    # BUG FIX: encoding='utf-8' is required — with the platform default
    # encoding (e.g. cp1252 on Windows) writing Japanese text raised
    # UnicodeEncodeError. delete=False is intentional: Gradio serves the
    # file from this path after the handler returns.
    with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt', encoding='utf-8') as tmp_file:
        tmp_file.write(combined_text)
        output_path = tmp_file.name

    return combined_text, output_path, char_count_text, error_text
248
+
249
# Create Gradio interface.
# Layout: two columns — inputs (text / URLs / file upload) on the left,
# results (errors, character counts, extracted text, download) on the right.
with gr.Blocks(title="Document Content Extractor") as demo:
    gr.Markdown("# Document Content Extractor")
    # Japanese description: tool that extracts and merges text from raw
    # input, URLs and documents; lists supported formats and the packages
    # each loader requires.
    gr.Markdown("""テキスト、URL、各種ドキュメントからテキストを抽出・結合するツールです。

対応ファイル形式:
- テキストファイル (.txt)
- PDFファイル (.pdf) - pypdfが必要
- Wordドキュメント (.doc, .docx) - unstructuredが必要

必要なパッケージ:
- URL処理用: langchain-community, requests, beautifulsoup4
- PDF処理用: langchain-community, pypdf
- DOCX処理用: langchain-community, unstructured""")

    with gr.Row():
        # Left column: input widgets.
        with gr.Column():
            # Direct text entry (label: "text input").
            raw_text = gr.Textbox(
                label="テキスト入力",
                placeholder="直接テキストを入力できます...",
                lines=5
            )
            # One URL per line (label: "URL input (one per line)").
            url_input = gr.Textbox(
                label="URL入力(1行に1つ)",
                placeholder="URLを入力してください...",
                lines=5
            )
            # Multiple uploads restricted to the supported extensions.
            files = gr.File(
                label="ファイルアップロード",
                file_count="multiple",
                file_types=[".txt", ".pdf", ".doc", ".docx"]
            )
            # "Extract & combine" trigger button.
            combine_btn = gr.Button("抽出・結合")

        # Right column: output widgets.
        with gr.Column():
            # Errors / warnings accumulated during processing.
            error_output = gr.Textbox(
                label="エラー・警告",
                lines=3,
                interactive=False,
                show_copy_button=True
            )
            # Character-count summary (label: "character count").
            char_count_output = gr.Textbox(
                label="文字数",
                lines=3,
                interactive=False
            )
            # The combined extracted text (label: "extracted text").
            text_output = gr.Textbox(
                label="抽出されたテキスト",
                lines=20,
                interactive=False,
                show_copy_button=True
            )
            # Download link for the combined text file.
            file_output = gr.File(label="結合テキストをダウンロード")

    # Wire the button to combine_content; output order must match the
    # 4-tuple returned by combine_content.
    combine_btn.click(
        fn=combine_content,
        inputs=[raw_text, url_input, files],
        outputs=[text_output, file_output, char_count_output, error_output]
    )

if __name__ == "__main__":
    demo.launch()