openfree committed on
Commit
cca699c
·
verified ·
1 Parent(s): 7156e9b

Upload text_utils.py

Browse files
Files changed (1) hide show
  1. text_utils.py +375 -0
text_utils.py ADDED
@@ -0,0 +1,375 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ============================================
2
+ # text_utils.py
3
+ # ํŒŒ์ผ ์ถ”์ถœ, ์›น ๊ฒ€์ƒ‰, ๊ธฐ๋ณธ ํ…์ŠคํŠธ ์ฒ˜๋ฆฌ ํ•จ์ˆ˜๋“ค
4
+ # ============================================
5
+
6
+ import re, os, json, time, zipfile, tempfile, zlib
7
+ from pathlib import Path
8
+ from collections import Counter
9
+ from concurrent.futures import ThreadPoolExecutor, as_completed
10
+ from xml.etree import ElementTree as ET
11
+
12
+ try:
13
+ import httpx
14
+ HAS_HTTPX = True
15
+ except ImportError:
16
+ HAS_HTTPX = False
17
+
18
+ try:
19
+ import pdfplumber
20
+ HAS_PDFPLUMBER = True
21
+ except ImportError:
22
+ HAS_PDFPLUMBER = False
23
+
24
+ try:
25
+ import PyPDF2
26
+ HAS_PYPDF2 = True
27
+ except ImportError:
28
+ HAS_PYPDF2 = False
29
+
30
+ try:
31
+ from docx import Document as DocxDocument
32
+ HAS_DOCX = True
33
+ except ImportError:
34
+ HAS_DOCX = False
35
+
36
+ try:
37
+ import olefile
38
+ HAS_OLEFILE = True
39
+ except ImportError:
40
+ HAS_OLEFILE = False
41
+
42
+ # ============================================
43
+ # ํŒŒ์ผ ์ถ”์ถœ ํ•จ์ˆ˜๋“ค
44
+ # ============================================
45
+
46
def extract_text_from_pdf(file_path):
    """Extract text from a PDF, one string per page that yields text.

    pdfplumber is tried first; PyPDF2 is the fallback (pages already
    collected by a partially-failed pdfplumber pass are kept).

    Returns:
        (pages, None) on success, or (None, error_message) when neither
        backend produced any text.
    """
    pages = []

    def harvest(page_iter):
        # Append every non-empty page text to the shared `pages` list.
        for page in page_iter:
            text = page.extract_text()
            if text:
                pages.append(text)

    if HAS_PDFPLUMBER:
        try:
            with pdfplumber.open(file_path) as pdf:
                harvest(pdf.pages)
            if pages:
                return pages, None
        except Exception as e:
            print(f"pdfplumber: {e}")
    if HAS_PYPDF2:
        try:
            with open(file_path, 'rb') as f:
                harvest(PyPDF2.PdfReader(f).pages)
            if pages:
                return pages, None
        except Exception as e:
            print(f"PyPDF2: {e}")
    return None, "PDF ์ถ”์ถœ ์‹คํŒจ"
69
+
70
def extract_text_from_docx(file_path):
    """Extract text from a .docx file, grouped into sections.

    Consecutive non-empty paragraphs are joined with newlines; a blank
    paragraph closes the current section.

    Returns:
        (sections, None) on success, or (None, error_message) on failure.
    """
    if not HAS_DOCX:
        return None, "python-docx ์—†์Œ"
    try:
        sections = []
        buffer = []

        def flush():
            # Close the current section, if any text was accumulated.
            if buffer:
                sections.append('\n'.join(buffer))
                buffer.clear()

        for para in DocxDocument(file_path).paragraphs:
            stripped = para.text.strip()
            if stripped:
                buffer.append(stripped)
            else:
                flush()
        flush()
        if sections:
            return sections, None
        return None, "DOCX ํ…์ŠคํŠธ ์—†์Œ"
    except Exception as e:
        return None, f"DOCX ์˜ค๋ฅ˜: {e}"
90
+
91
def extract_text_from_txt(file_path):
    """Extract text from a plain-text file (TXT/MD/CSV, ...).

    Tries a list of candidate encodings in order; the first decode that
    yields non-blank text wins. Text is split into sections on blank
    lines (runs of 2+ newlines).

    Args:
        file_path: path to the text file.

    Returns:
        (sections, None) on success, where sections is a non-empty list
        of stripped section strings (or the whole text as a single
        element); (None, error_message) when the file cannot be read or
        decoded to non-blank text.
    """
    for enc in ['utf-8', 'euc-kr', 'cp949', 'utf-16', 'latin-1']:
        try:
            with open(file_path, 'r', encoding=enc) as f:
                text = f.read()
        # Narrowed from a bare `except:` that swallowed everything
        # (including MemoryError/KeyboardInterrupt). UnicodeDecodeError
        # is a ValueError subclass; OSError covers unreadable files.
        except (OSError, ValueError):
            continue
        if text.strip():
            sections = [s.strip() for s in re.split(r'\n{2,}', text) if s.strip()]
            return sections if sections else [text], None
    return None, "ํ…์ŠคํŠธ ์ธ์ฝ”๋”ฉ ์‹คํŒจ"
102
+
103
def extract_text_from_hwpx(file_path):
    """Extract text from an HWPX file (Hangul 2007+, a zip of XML parts).

    Returns (text_parts, None) on success — one joined string per
    section XML — or (None, error_message) on failure.
    """
    try:
        text_parts = []
        with zipfile.ZipFile(file_path, 'r') as zf:
            file_list = zf.namelist()
            # Section parts normally live at Contents/section*.xml.
            section_files = sorted([f for f in file_list if f.startswith('Contents/section') and f.endswith('.xml')])
            if not section_files:
                # Fallback: any XML whose name mentions "section".
                section_files = sorted([f for f in file_list if 'section' in f.lower() and f.endswith('.xml')])
            for sf_name in section_files:
                try:
                    with zf.open(sf_name) as sf:
                        content = sf.read().decode('utf-8', errors='ignore')
                        # Strip namespace declarations and tag prefixes so
                        # tags can be matched without namespace handling.
                        content = re.sub(r'\sxmlns[^"]*"[^"]*"', '', content)
                        content = re.sub(r'<[a-zA-Z]+:', '<', content)
                        content = re.sub(r'</[a-zA-Z]+:', '</', content)
                        try:
                            root = ET.fromstring(content)
                            texts = []
                            for elem in root.iter():
                                # Tags ending in 't' are treated as text runs
                                # (presumably hp:t elements — TODO confirm).
                                if elem.tag.endswith('t') or elem.tag == 't':
                                    if elem.text: texts.append(elem.text)
                                elif elem.text and elem.text.strip():
                                    # Other text-bearing wrappers, matched by name.
                                    if any(x in elem.tag.lower() for x in ['text', 'run', 'para', 'char']):
                                        texts.append(elem.text.strip())
                            if texts: text_parts.append(' '.join(texts))
                        except ET.ParseError:
                            # Malformed XML: scrape raw text between tags instead.
                            matches = re.findall(r'>([^<]+)<', content)
                            clean = [t.strip() for t in matches if t.strip() and len(t.strip()) > 1]
                            if clean: text_parts.append(' '.join(clean))
                except: continue  # best effort: skip unreadable section parts
        if text_parts:
            return text_parts, None
        return None, "HWPX ํ…์ŠคํŠธ ์—†์Œ"
    except zipfile.BadZipFile:
        return None, "์œ ํšจํ•˜์ง€ ์•Š์€ HWPX"
    except Exception as e:
        return None, f"HWPX ์˜ค๋ฅ˜: {e}"
141
+
142
+ def _decode_hwp_para(data):
143
+ """HWP ๋ฌธ๋‹จ ๋””์ฝ”๋”ฉ"""
144
+ result = []
145
+ i = 0
146
+ while i < len(data) - 1:
147
+ code = int.from_bytes(data[i:i+2], 'little')
148
+ if code in (1,2,3): i += 14
149
+ elif code == 9: result.append('\t')
150
+ elif code in (10,13): result.append('\n')
151
+ elif code == 24: result.append('-')
152
+ elif code in (30,31): result.append(' ')
153
+ elif code >= 32:
154
+ try:
155
+ ch = chr(code)
156
+ if ch.isprintable() or ch in '\n\t ': result.append(ch)
157
+ except: pass
158
+ i += 2
159
+ text = ''.join(result).strip()
160
+ text = re.sub(r'[ \t]+', ' ', text)
161
+ text = re.sub(r'\n{3,}', '\n\n', text)
162
+ return text if len(text) > 2 else None
163
+
164
def _extract_hwp_section(data):
    """Walk HWP record headers in a BodyText section stream and decode
    paragraph-text records.

    Returns decoded paragraphs joined with newlines, or None when no
    text record decodes successfully.
    """
    texts = []
    pos = 0
    while pos < len(data) - 4:
        try:
            # 32-bit record header: tag id in bits 0-9, size in bits 20-31.
            header = int.from_bytes(data[pos:pos+4], 'little')
            tag_id = header & 0x3FF
            size = (header >> 20) & 0xFFF
            pos += 4
            if size == 0xFFF:
                # Size field saturated: the real 32-bit size follows the header.
                if pos + 4 > len(data): break
                size = int.from_bytes(data[pos:pos+4], 'little')
                pos += 4
            if pos + size > len(data): break
            record_data = data[pos:pos+size]
            pos += size
            # Tag 67 — presumably HWPTAG_PARA_TEXT (0x10 + 51), the
            # paragraph character stream — TODO confirm against the spec.
            if tag_id == 67 and size > 0:
                t = _decode_hwp_para(record_data)
                if t: texts.append(t)
        except:
            # On any parse error, resynchronise by advancing one byte.
            pos += 1
    return '\n'.join(texts) if texts else None
187
+
188
def extract_text_from_hwp(file_path):
    """Extract text from a legacy HWP (v5, OLE compound file) document.

    Reads FileHeader to detect body-stream compression, then decodes
    every BodyText/Section* stream via _extract_hwp_section.

    Returns:
        (texts, None) on success — one string per section — or
        (None, error_message) on failure.
    """
    if not HAS_OLEFILE:
        return None, "olefile ์—†์Œ"
    try:
        ole = olefile.OleFileIO(file_path)
    except Exception as e:
        return None, f"HWP ์˜ค๋ฅ˜: {e}"
    try:
        if not ole.exists('FileHeader'):
            return None, "HWP ํ—ค๋” ์—†์Œ"
        header_data = ole.openstream('FileHeader').read()
        # Bit 0 of header byte 36 flags deflate-compressed body streams;
        # assume compressed when the header is too short to tell.
        is_compressed = (header_data[36] & 1) == 1 if len(header_data) > 36 else True
        all_texts = []
        for entry in ole.listdir():
            entry_path = '/'.join(entry)
            # BUGFIX: body streams are named "BodyText/Section<N>". The old
            # filter required an "_content.xml" suffix, which OLE HWP streams
            # never have, so no text was ever extracted.
            if not (entry_path.startswith('BodyText') and 'Section' in entry_path):
                continue
            try:
                with ole.openstream(entry) as stream:
                    content = stream.read()
                if is_compressed:
                    try:
                        # Raw deflate stream (no zlib header).
                        content = zlib.decompress(content, -zlib.MAX_WBITS)
                    except zlib.error:
                        pass  # best effort: fall back to the raw bytes
                t = _extract_hwp_section(content)
                if t:
                    all_texts.append(t)
            except Exception:
                pass  # skip unreadable streams, keep processing the rest
        if all_texts:
            return all_texts, None
        return None, "HWP ํ…์ŠคํŠธ ์—†์Œ"
    except Exception as e:
        return None, f"HWP ์˜ค๋ฅ˜: {e}"
    finally:
        # BUGFIX: the handle previously leaked on every error path.
        ole.close()
217
+
218
def extract_file_text_api(file_obj):
    """Convert an uploaded file object into a single text string.

    Dispatches on the file extension; unknown extensions are treated as
    plain text. Returns the extracted text joined with blank lines, or a
    warning/failure message string.
    """
    if not file_obj:
        return ""
    path = Path(file_obj.name)
    dispatch = {
        '.pdf': extract_text_from_pdf,
        '.docx': extract_text_from_docx,
        '.txt': extract_text_from_txt,
        '.md': extract_text_from_txt,
        '.csv': extract_text_from_txt,
        '.hwpx': extract_text_from_hwpx,
        '.hwp': extract_text_from_hwp,
    }
    # Anything unrecognised falls back to the plain-text extractor.
    extractor = dispatch.get(path.suffix.lower(), extract_text_from_txt)
    texts, error = extractor(str(path))
    if error:
        return f"โš ๏ธ {error}"
    return '\n\n'.join(texts) if texts else "ํ…์ŠคํŠธ ์ถ”์ถœ ์‹คํŒจ"
240
+
241
+ # ============================================
242
+ # ๊ธฐ๋ณธ ํ…์ŠคํŠธ ์ฒ˜๋ฆฌ
243
+ # ============================================
244
+
245
def split_sentences(text):
    """Split *text* into sentences.

    Whitespace is normalised to single spaces first; sentences end at
    runs of '.', '!' or '?' followed by whitespace or end-of-string.
    Returns a list of non-empty sentence strings.
    """
    collapsed = re.sub(r'\s+', ' ', text).strip()
    pieces = (p.strip() for p in re.split(r'[.!?]+(?=\s|$)', collapsed))
    return [p for p in pieces if p]
251
+
252
def split_words(text):
    """Tokenise *text* into words: maximal runs of Hangul syllables,
    ASCII letters, or digits."""
    # findall with a '+'-quantified class never yields empty matches,
    # so the truthiness filter is a defensive no-op.
    return [w for w in re.findall(r'[๊ฐ€-ํžฃa-zA-Z0-9]+', text) if w]
255
+
256
+ # ============================================
257
+ # HTTP ํ—ฌํผ
258
+ # ============================================
259
+
260
def http_get(url, headers=None, timeout=10):
    """HTTP GET helper.

    Args:
        url: target URL.
        headers: optional dict of request headers.
        timeout: request timeout in seconds.

    Returns:
        The response body text on HTTP 200; None on non-200 status, any
        request error, or when httpx is not installed.
    """
    if not HAS_HTTPX:
        return None
    try:
        r = httpx.get(url, headers=headers, timeout=timeout)
        return r.text if r.status_code == 200 else None
    # Narrowed from a bare `except:` that also swallowed
    # SystemExit/KeyboardInterrupt.
    except Exception:
        return None
268
+
269
+ # ============================================
270
+ # ์›น ๊ฒ€์ƒ‰ ํ•จ์ˆ˜๋“ค
271
+ # ============================================
272
+
273
def brave_search(query, count=5):
    """Search the web with the Brave Search API.

    Args:
        query: search string (percent-encoded before the request).
        count: maximum number of results to request.

    Returns:
        A list of {"title", "url", "snippet", "source"} dicts; empty when
        BRAVE_API_KEY is unset, httpx is missing, or the request fails.
    """
    BRAVE_KEY = os.getenv("BRAVE_API_KEY", "")
    if not BRAVE_KEY: return []
    import urllib.parse  # local import, matching search_arxiv's style
    # BUGFIX: the query was interpolated raw; spaces/Hangul produced an
    # invalid URL (search_arxiv/duckduckgo_search already quote theirs).
    q = urllib.parse.quote(query)
    url = f"https://api.search.brave.com/res/v1/web/search?q={q}&count={count}"
    try:
        if HAS_HTTPX:
            r = httpx.get(url, headers={"X-Subscription-Token": BRAVE_KEY, "Accept": "application/json"}, timeout=10)
            if r.status_code == 200:
                data = r.json()
                results = []
                for item in data.get("web", {}).get("results", []):
                    results.append({"title": item.get("title",""), "url": item.get("url",""), "snippet": item.get("description",""), "source": "Brave"})
                return results
    except Exception:  # narrowed from bare except; best-effort helper
        pass
    return []
289
+
290
def search_kci(query):
    """Search the KCI (Korea Citation Index) open API by article title.

    Args:
        query: title search string (percent-encoded before the request).

    Returns:
        Up to 3 {"title", "url", "snippet", "source"} dicts parsed from
        the XML response; empty list on any failure (best effort).
    """
    try:
        import urllib.parse
        # BUGFIX: the query was interpolated unencoded, breaking URLs
        # that contain spaces or Hangul.
        q = urllib.parse.quote(query)
        url = f"https://open.kci.go.kr/po/openapi/openApiSearch.kci?apiCode=articleSearch&title={q}&displayCount=3"
        resp = http_get(url, timeout=8)
        if resp:
            results = []
            for m in re.finditer(r'<article-title><!\[CDATA\[(.+?)\]\]></article-title>.*?<url><!\[CDATA\[(.+?)\]\]></url>', resp, re.S):
                results.append({"title": m.group(1), "url": m.group(2), "snippet": "", "source": "KCI"})
            return results[:3]
    except Exception:  # narrowed from bare except; best-effort helper
        pass
    return []
302
+
303
def search_riss(query):
    """Scrape RISS web search results for *query*.

    Args:
        query: search string (percent-encoded before the request).

    Returns:
        Up to 3 {"title", "url", "snippet", "source"} dicts; empty list
        on any failure (best effort).
    """
    results = []
    try:
        import urllib.parse
        # BUGFIX: the query was interpolated unencoded, breaking URLs
        # that contain spaces or Hangul.
        q = urllib.parse.quote(query)
        url = f"http://www.riss.kr/search/Search.do?isDetailSearch=N&searchGubun=true&viewYn=OP&queryText=&strQuery={q}&iStartCount=0&iGroupView=5&icate=all"
        resp = http_get(url, timeout=8)
        if resp:
            for m in re.finditer(r'class="title"[^>]*>.*?<a[^>]*href="([^"]+)"[^>]*>(.*?)</a>', resp, re.S):
                title = re.sub(r'<[^>]+>', '', m.group(2)).strip()
                if title:
                    results.append({"title": title, "url": "https://www.riss.kr" + m.group(1), "snippet": "", "source": "RISS"})
    except Exception:  # narrowed from bare except; best-effort helper
        pass
    return results[:3]
316
+
317
def search_arxiv(query):
    """Query the arXiv Atom API for papers matching *query*.

    Args:
        query: free-text search string (percent-encoded before the request).

    Returns:
        Up to 3 {"title", "url", "snippet", "source"} dicts, with
        snippets truncated to 150 characters; empty list on any failure.
    """
    results = []
    try:
        import urllib.parse
        q = urllib.parse.quote(query)
        url = f"https://export.arxiv.org/api/query?search_query=all:{q}&start=0&max_results=3&sortBy=relevance"
        resp = http_get(url, timeout=12)
        if resp:
            # Atom feed parsed with a regex; <id> holds the abstract URL.
            for m in re.finditer(r'<entry>.*?<title>(.*?)</title>.*?<id>(.*?)</id>.*?<summary>(.*?)</summary>', resp, re.S):
                title = re.sub(r'\s+', ' ', m.group(1)).strip()
                results.append({"title": title, "url": m.group(2).strip(), "snippet": re.sub(r'\s+', ' ', m.group(3)).strip()[:150], "source": "arXiv"})
    except Exception:  # narrowed from bare except; best-effort helper
        pass
    return results[:3]
331
+
332
def duckduckgo_search(query, max_results=5):
    """Scrape the DuckDuckGo HTML endpoint for web search results.

    Args:
        query: search string (percent-encoded before the request).
        max_results: cap on the number of results returned.

    Returns:
        A list of {"title", "url", "snippet", "source"} dicts; empty on
        any failure (best effort).
    """
    results = []
    try:
        import urllib.parse
        q = urllib.parse.quote(query)
        url = f"https://html.duckduckgo.com/html/?q={q}"
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
        resp = http_get(url, headers=headers, timeout=10)
        if resp:
            for m in re.finditer(r'<a[^>]+class="result__a"[^>]+href="([^"]+)"[^>]*>(.*?)</a>.*?<a[^>]+class="result__snippet"[^>]*>(.*?)</a>', resp, re.S):
                href = m.group(1)
                title = re.sub(r'<[^>]+>', '', m.group(2)).strip()
                snippet = re.sub(r'<[^>]+>', '', m.group(3)).strip()
                real_url = href
                # DDG wraps targets in a redirect: /l/?uddg=<encoded real URL>.
                if 'uddg=' in href:
                    um = re.search(r'uddg=([^&]+)', href)
                    if um:
                        real_url = urllib.parse.unquote(um.group(1))
                if title:
                    results.append({"title": title, "url": real_url, "snippet": snippet, "source": "Web"})
                if len(results) >= max_results:
                    break
    except Exception:  # narrowed from bare except; best-effort helper
        pass
    return results
355
+
356
def self_crawl_search(query, max_results=3):
    """Crawl DuckDuckGo for *query*, adding a small academic follow-up
    search when the query does not already look scholarly."""
    combined = duckduckgo_search(query, max_results)
    looks_academic = '๋…ผ๋ฌธ' in query or 'paper' in query.lower()
    if not looks_academic:
        combined = combined + duckduckgo_search(f"{query} ๋…ผ๋ฌธ ํ•™์ˆ ", 2)
    return combined
363
+
364
def parallel_brave_search(queries, max_workers=10):
    """Run brave_search over many queries concurrently.

    Args:
        queries: iterable of query strings.
        max_workers: requested thread-pool size, capped at 20.

    Returns:
        Dict mapping each query to its (possibly empty) result list; a
        query whose lookup raised maps to [].
    """
    all_results = {}
    with ThreadPoolExecutor(max_workers=min(max_workers, 20)) as executor:
        futures = {executor.submit(brave_search, q, 3): q for q in queries}
        for future in as_completed(futures):
            q = futures[future]
            try:
                all_results[q] = future.result()
            # Narrowed from a bare `except:`; a failed query degrades to [].
            except Exception:
                all_results[q] = []
    return all_results