openfree committed on
Commit
cca699c
·
verified ·
1 Parent(s): 7156e9b

Upload text_utils.py

Browse files
Files changed (1) hide show
  1. text_utils.py +375 -0
text_utils.py ADDED
@@ -0,0 +1,375 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ============================================
2
+ # text_utils.py
3
+ # ํŒŒ์ผ ์ถ”์ถœ, ์›น ๊ฒ€์ƒ‰, ๊ธฐ๋ณธ ํ…์ŠคํŠธ ์ฒ˜๋ฆฌ ํ•จ์ˆ˜๋“ค
4
+ # ============================================
5
+
6
+ import re, os, json, time, zipfile, tempfile, zlib
7
+ from pathlib import Path
8
+ from collections import Counter
9
+ from concurrent.futures import ThreadPoolExecutor, as_completed
10
+ from xml.etree import ElementTree as ET
11
+
12
+ try:
13
+ import httpx
14
+ HAS_HTTPX = True
15
+ except ImportError:
16
+ HAS_HTTPX = False
17
+
18
+ try:
19
+ import pdfplumber
20
+ HAS_PDFPLUMBER = True
21
+ except ImportError:
22
+ HAS_PDFPLUMBER = False
23
+
24
+ try:
25
+ import PyPDF2
26
+ HAS_PYPDF2 = True
27
+ except ImportError:
28
+ HAS_PYPDF2 = False
29
+
30
+ try:
31
+ from docx import Document as DocxDocument
32
+ HAS_DOCX = True
33
+ except ImportError:
34
+ HAS_DOCX = False
35
+
36
+ try:
37
+ import olefile
38
+ HAS_OLEFILE = True
39
+ except ImportError:
40
+ HAS_OLEFILE = False
41
+
42
+ # ============================================
43
+ # ํŒŒ์ผ ์ถ”์ถœ ํ•จ์ˆ˜๋“ค
44
+ # ============================================
45
+
46
def extract_text_from_pdf(file_path):
    """Extract text from a PDF, one string per page that yields text.

    pdfplumber is tried first; PyPDF2 is the fallback (pages already
    collected by a partially-failed pdfplumber pass are kept).

    Returns:
        (pages, None) on success, or (None, error_message) when neither
        backend produced any text.
    """
    pages = []

    def harvest(page_iter):
        # Append every non-empty page text to the shared `pages` list.
        for page in page_iter:
            text = page.extract_text()
            if text:
                pages.append(text)

    if HAS_PDFPLUMBER:
        try:
            with pdfplumber.open(file_path) as pdf:
                harvest(pdf.pages)
            if pages:
                return pages, None
        except Exception as e:
            print(f"pdfplumber: {e}")
    if HAS_PYPDF2:
        try:
            with open(file_path, 'rb') as f:
                harvest(PyPDF2.PdfReader(f).pages)
            if pages:
                return pages, None
        except Exception as e:
            print(f"PyPDF2: {e}")
    return None, "PDF ์ถ”์ถœ ์‹คํŒจ"
69
+
70
def extract_text_from_docx(file_path):
    """Extract text from a .docx file, grouped into sections.

    Consecutive non-empty paragraphs are joined with newlines; a blank
    paragraph closes the current section.

    Returns:
        (sections, None) on success, or (None, error_message) on failure.
    """
    if not HAS_DOCX:
        return None, "python-docx ์—†์Œ"
    try:
        sections = []
        buffer = []

        def flush():
            # Close the current section, if any text was accumulated.
            if buffer:
                sections.append('\n'.join(buffer))
                buffer.clear()

        for para in DocxDocument(file_path).paragraphs:
            stripped = para.text.strip()
            if stripped:
                buffer.append(stripped)
            else:
                flush()
        flush()
        if sections:
            return sections, None
        return None, "DOCX ํ…์ŠคํŠธ ์—†์Œ"
    except Exception as e:
        return None, f"DOCX ์˜ค๋ฅ˜: {e}"
90
+
91
def extract_text_from_txt(file_path):
    """Extract text from a plain-text file (TXT/MD/CSV, ...).

    Tries a list of candidate encodings in order; the first decode that
    yields non-blank text wins. Text is split into sections on blank
    lines (runs of 2+ newlines).

    Args:
        file_path: path to the text file.

    Returns:
        (sections, None) on success, where sections is a non-empty list
        of stripped section strings (or the whole text as a single
        element); (None, error_message) when the file cannot be read or
        decoded to non-blank text.
    """
    for enc in ['utf-8', 'euc-kr', 'cp949', 'utf-16', 'latin-1']:
        try:
            with open(file_path, 'r', encoding=enc) as f:
                text = f.read()
        # Narrowed from a bare `except:` that swallowed everything
        # (including MemoryError/KeyboardInterrupt). UnicodeDecodeError
        # is a ValueError subclass; OSError covers unreadable files.
        except (OSError, ValueError):
            continue
        if text.strip():
            sections = [s.strip() for s in re.split(r'\n{2,}', text) if s.strip()]
            return sections if sections else [text], None
    return None, "ํ…์ŠคํŠธ ์ธ์ฝ”๋”ฉ ์‹คํŒจ"
102
+
103
def extract_text_from_hwpx(file_path):
    """Extract text from an HWPX file (Hangul 2007+, a zip of XML parts).

    Returns (text_parts, None) on success — one joined string per
    section XML — or (None, error_message) on failure.
    """
    try:
        text_parts = []
        with zipfile.ZipFile(file_path, 'r') as zf:
            file_list = zf.namelist()
            # Section parts normally live at Contents/section*.xml.
            section_files = sorted([f for f in file_list if f.startswith('Contents/section') and f.endswith('.xml')])
            if not section_files:
                # Fallback: any XML whose name mentions "section".
                section_files = sorted([f for f in file_list if 'section' in f.lower() and f.endswith('.xml')])
            for sf_name in section_files:
                try:
                    with zf.open(sf_name) as sf:
                        content = sf.read().decode('utf-8', errors='ignore')
                        # Strip namespace declarations and tag prefixes so
                        # tags can be matched without namespace handling.
                        content = re.sub(r'\sxmlns[^"]*"[^"]*"', '', content)
                        content = re.sub(r'<[a-zA-Z]+:', '<', content)
                        content = re.sub(r'</[a-zA-Z]+:', '</', content)
                        try:
                            root = ET.fromstring(content)
                            texts = []
                            for elem in root.iter():
                                # Tags ending in 't' are treated as text runs
                                # (presumably hp:t elements — TODO confirm).
                                if elem.tag.endswith('t') or elem.tag == 't':
                                    if elem.text: texts.append(elem.text)
                                elif elem.text and elem.text.strip():
                                    # Other text-bearing wrappers, matched by name.
                                    if any(x in elem.tag.lower() for x in ['text', 'run', 'para', 'char']):
                                        texts.append(elem.text.strip())
                            if texts: text_parts.append(' '.join(texts))
                        except ET.ParseError:
                            # Malformed XML: scrape raw text between tags instead.
                            matches = re.findall(r'>([^<]+)<', content)
                            clean = [t.strip() for t in matches if t.strip() and len(t.strip()) > 1]
                            if clean: text_parts.append(' '.join(clean))
                except: continue  # best effort: skip unreadable section parts
        if text_parts:
            return text_parts, None
        return None, "HWPX ํ…์ŠคํŠธ ์—†์Œ"
    except zipfile.BadZipFile:
        return None, "์œ ํšจํ•˜์ง€ ์•Š์€ HWPX"
    except Exception as e:
        return None, f"HWPX ์˜ค๋ฅ˜: {e}"
141
+
142
+ def _decode_hwp_para(data):
143
+ """HWP ๋ฌธ๋‹จ ๋””์ฝ”๋”ฉ"""
144
+ result = []
145
+ i = 0
146
+ while i < len(data) - 1:
147
+ code = int.from_bytes(data[i:i+2], 'little')
148
+ if code in (1,2,3): i += 14
149
+ elif code == 9: result.append('\t')
150
+ elif code in (10,13): result.append('\n')
151
+ elif code == 24: result.append('-')
152
+ elif code in (30,31): result.append(' ')
153
+ elif code >= 32:
154
+ try:
155
+ ch = chr(code)
156
+ if ch.isprintable() or ch in '\n\t ': result.append(ch)
157
+ except: pass
158
+ i += 2
159
+ text = ''.join(result).strip()
160
+ text = re.sub(r'[ \t]+', ' ', text)
161
+ text = re.sub(r'\n{3,}', '\n\n', text)
162
+ return text if len(text) > 2 else None
163
+
164
def _extract_hwp_section(data):
    """Walk HWP record headers in a BodyText section stream and decode
    paragraph-text records.

    Returns decoded paragraphs joined with newlines, or None when no
    text record decodes successfully.
    """
    texts = []
    pos = 0
    while pos < len(data) - 4:
        try:
            # 32-bit record header: tag id in bits 0-9, size in bits 20-31.
            header = int.from_bytes(data[pos:pos+4], 'little')
            tag_id = header & 0x3FF
            size = (header >> 20) & 0xFFF
            pos += 4
            if size == 0xFFF:
                # Size field saturated: the real 32-bit size follows the header.
                if pos + 4 > len(data): break
                size = int.from_bytes(data[pos:pos+4], 'little')
                pos += 4
            if pos + size > len(data): break
            record_data = data[pos:pos+size]
            pos += size
            # Tag 67 — presumably HWPTAG_PARA_TEXT (0x10 + 51), the
            # paragraph character stream — TODO confirm against the spec.
            if tag_id == 67 and size > 0:
                t = _decode_hwp_para(record_data)
                if t: texts.append(t)
        except:
            # On any parse error, resynchronise by advancing one byte.
            pos += 1
    return '\n'.join(texts) if texts else None
187
+
188
def extract_text_from_hwp(file_path):
    """Extract text from a legacy HWP (v5, OLE compound file) document.

    Reads FileHeader to detect body-stream compression, then decodes
    every BodyText/Section* stream via _extract_hwp_section.

    Returns:
        (texts, None) on success — one string per section — or
        (None, error_message) on failure.
    """
    if not HAS_OLEFILE:
        return None, "olefile ์—†์Œ"
    try:
        ole = olefile.OleFileIO(file_path)
    except Exception as e:
        return None, f"HWP ์˜ค๋ฅ˜: {e}"
    try:
        if not ole.exists('FileHeader'):
            return None, "HWP ํ—ค๋” ์—†์Œ"
        header_data = ole.openstream('FileHeader').read()
        # Bit 0 of header byte 36 flags deflate-compressed body streams;
        # assume compressed when the header is too short to tell.
        is_compressed = (header_data[36] & 1) == 1 if len(header_data) > 36 else True
        all_texts = []
        for entry in ole.listdir():
            entry_path = '/'.join(entry)
            # BUGFIX: body streams are named "BodyText/Section<N>". The old
            # filter required an "_content.xml" suffix, which OLE HWP streams
            # never have, so no text was ever extracted.
            if not (entry_path.startswith('BodyText') and 'Section' in entry_path):
                continue
            try:
                with ole.openstream(entry) as stream:
                    content = stream.read()
                if is_compressed:
                    try:
                        # Raw deflate stream (no zlib header).
                        content = zlib.decompress(content, -zlib.MAX_WBITS)
                    except zlib.error:
                        pass  # best effort: fall back to the raw bytes
                t = _extract_hwp_section(content)
                if t:
                    all_texts.append(t)
            except Exception:
                pass  # skip unreadable streams, keep processing the rest
        if all_texts:
            return all_texts, None
        return None, "HWP ํ…์ŠคํŠธ ์—†์Œ"
    except Exception as e:
        return None, f"HWP ์˜ค๋ฅ˜: {e}"
    finally:
        # BUGFIX: the handle previously leaked on every error path.
        ole.close()
217
+
218
def extract_file_text_api(file_obj):
    """Convert an uploaded file object into a single text string.

    Dispatches on the file extension; unknown extensions are treated as
    plain text. Returns the extracted text joined with blank lines, or a
    warning/failure message string.
    """
    if not file_obj:
        return ""
    path = Path(file_obj.name)
    dispatch = {
        '.pdf': extract_text_from_pdf,
        '.docx': extract_text_from_docx,
        '.txt': extract_text_from_txt,
        '.md': extract_text_from_txt,
        '.csv': extract_text_from_txt,
        '.hwpx': extract_text_from_hwpx,
        '.hwp': extract_text_from_hwp,
    }
    # Anything unrecognised falls back to the plain-text extractor.
    extractor = dispatch.get(path.suffix.lower(), extract_text_from_txt)
    texts, error = extractor(str(path))
    if error:
        return f"โš ๏ธ {error}"
    return '\n\n'.join(texts) if texts else "ํ…์ŠคํŠธ ์ถ”์ถœ ์‹คํŒจ"
240
+
241
+ # ============================================
242
+ # ๊ธฐ๋ณธ ํ…์ŠคํŠธ ์ฒ˜๋ฆฌ
243
+ # ============================================
244
+
245
def split_sentences(text):
    """Split *text* into sentences.

    Whitespace is normalised to single spaces first; sentences end at
    runs of '.', '!' or '?' followed by whitespace or end-of-string.
    Returns a list of non-empty sentence strings.
    """
    collapsed = re.sub(r'\s+', ' ', text).strip()
    pieces = (p.strip() for p in re.split(r'[.!?]+(?=\s|$)', collapsed))
    return [p for p in pieces if p]
251
+
252
def split_words(text):
    """Tokenise *text* into words: maximal runs of Hangul syllables,
    ASCII letters, or digits."""
    # findall with a '+'-quantified class never yields empty matches,
    # so the truthiness filter is a defensive no-op.
    return [w for w in re.findall(r'[๊ฐ€-ํžฃa-zA-Z0-9]+', text) if w]
255
+
256
+ # ============================================
257
+ # HTTP ํ—ฌํผ
258
+ # ============================================
259
+
260
def http_get(url, headers=None, timeout=10):
    """HTTP GET helper.

    Args:
        url: target URL.
        headers: optional dict of request headers.
        timeout: request timeout in seconds.

    Returns:
        The response body text on HTTP 200; None on non-200 status, any
        request error, or when httpx is not installed.
    """
    if not HAS_HTTPX:
        return None
    try:
        r = httpx.get(url, headers=headers, timeout=timeout)
        return r.text if r.status_code == 200 else None
    # Narrowed from a bare `except:` that also swallowed
    # SystemExit/KeyboardInterrupt.
    except Exception:
        return None
268
+
269
+ # ============================================
270
+ # ์›น ๊ฒ€์ƒ‰ ํ•จ์ˆ˜๋“ค
271
+ # ============================================
272
+
273
def brave_search(query, count=5):
    """Search the web with the Brave Search API.

    Args:
        query: search string (percent-encoded before the request).
        count: maximum number of results to request.

    Returns:
        A list of {"title", "url", "snippet", "source"} dicts; empty when
        BRAVE_API_KEY is unset, httpx is missing, or the request fails.
    """
    BRAVE_KEY = os.getenv("BRAVE_API_KEY", "")
    if not BRAVE_KEY: return []
    import urllib.parse  # local import, matching search_arxiv's style
    # BUGFIX: the query was interpolated raw; spaces/Hangul produced an
    # invalid URL (search_arxiv/duckduckgo_search already quote theirs).
    q = urllib.parse.quote(query)
    url = f"https://api.search.brave.com/res/v1/web/search?q={q}&count={count}"
    try:
        if HAS_HTTPX:
            r = httpx.get(url, headers={"X-Subscription-Token": BRAVE_KEY, "Accept": "application/json"}, timeout=10)
            if r.status_code == 200:
                data = r.json()
                results = []
                for item in data.get("web", {}).get("results", []):
                    results.append({"title": item.get("title",""), "url": item.get("url",""), "snippet": item.get("description",""), "source": "Brave"})
                return results
    except Exception:  # narrowed from bare except; best-effort helper
        pass
    return []
289
+
290
def search_kci(query):
    """Search the KCI (Korea Citation Index) open API by article title.

    Args:
        query: title search string (percent-encoded before the request).

    Returns:
        Up to 3 {"title", "url", "snippet", "source"} dicts parsed from
        the XML response; empty list on any failure (best effort).
    """
    try:
        import urllib.parse
        # BUGFIX: the query was interpolated unencoded, breaking URLs
        # that contain spaces or Hangul.
        q = urllib.parse.quote(query)
        url = f"https://open.kci.go.kr/po/openapi/openApiSearch.kci?apiCode=articleSearch&title={q}&displayCount=3"
        resp = http_get(url, timeout=8)
        if resp:
            results = []
            for m in re.finditer(r'<article-title><!\[CDATA\[(.+?)\]\]></article-title>.*?<url><!\[CDATA\[(.+?)\]\]></url>', resp, re.S):
                results.append({"title": m.group(1), "url": m.group(2), "snippet": "", "source": "KCI"})
            return results[:3]
    except Exception:  # narrowed from bare except; best-effort helper
        pass
    return []
302
+
303
def search_riss(query):
    """Scrape RISS web search results for *query*.

    Args:
        query: search string (percent-encoded before the request).

    Returns:
        Up to 3 {"title", "url", "snippet", "source"} dicts; empty list
        on any failure (best effort).
    """
    results = []
    try:
        import urllib.parse
        # BUGFIX: the query was interpolated unencoded, breaking URLs
        # that contain spaces or Hangul.
        q = urllib.parse.quote(query)
        url = f"http://www.riss.kr/search/Search.do?isDetailSearch=N&searchGubun=true&viewYn=OP&queryText=&strQuery={q}&iStartCount=0&iGroupView=5&icate=all"
        resp = http_get(url, timeout=8)
        if resp:
            for m in re.finditer(r'class="title"[^>]*>.*?<a[^>]*href="([^"]+)"[^>]*>(.*?)</a>', resp, re.S):
                title = re.sub(r'<[^>]+>', '', m.group(2)).strip()
                if title:
                    results.append({"title": title, "url": "https://www.riss.kr" + m.group(1), "snippet": "", "source": "RISS"})
    except Exception:  # narrowed from bare except; best-effort helper
        pass
    return results[:3]
316
+
317
def search_arxiv(query):
    """Query the arXiv Atom API for papers matching *query*.

    Args:
        query: free-text search string (percent-encoded before the request).

    Returns:
        Up to 3 {"title", "url", "snippet", "source"} dicts, with
        snippets truncated to 150 characters; empty list on any failure.
    """
    results = []
    try:
        import urllib.parse
        q = urllib.parse.quote(query)
        url = f"https://export.arxiv.org/api/query?search_query=all:{q}&start=0&max_results=3&sortBy=relevance"
        resp = http_get(url, timeout=12)
        if resp:
            # Atom feed parsed with a regex; <id> holds the abstract URL.
            for m in re.finditer(r'<entry>.*?<title>(.*?)</title>.*?<id>(.*?)</id>.*?<summary>(.*?)</summary>', resp, re.S):
                title = re.sub(r'\s+', ' ', m.group(1)).strip()
                results.append({"title": title, "url": m.group(2).strip(), "snippet": re.sub(r'\s+', ' ', m.group(3)).strip()[:150], "source": "arXiv"})
    except Exception:  # narrowed from bare except; best-effort helper
        pass
    return results[:3]
331
+
332
def duckduckgo_search(query, max_results=5):
    """Scrape the DuckDuckGo HTML endpoint for web search results.

    Args:
        query: search string (percent-encoded before the request).
        max_results: cap on the number of results returned.

    Returns:
        A list of {"title", "url", "snippet", "source"} dicts; empty on
        any failure (best effort).
    """
    results = []
    try:
        import urllib.parse
        q = urllib.parse.quote(query)
        url = f"https://html.duckduckgo.com/html/?q={q}"
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
        resp = http_get(url, headers=headers, timeout=10)
        if resp:
            for m in re.finditer(r'<a[^>]+class="result__a"[^>]+href="([^"]+)"[^>]*>(.*?)</a>.*?<a[^>]+class="result__snippet"[^>]*>(.*?)</a>', resp, re.S):
                href = m.group(1)
                title = re.sub(r'<[^>]+>', '', m.group(2)).strip()
                snippet = re.sub(r'<[^>]+>', '', m.group(3)).strip()
                real_url = href
                # DDG wraps targets in a redirect: /l/?uddg=<encoded real URL>.
                if 'uddg=' in href:
                    um = re.search(r'uddg=([^&]+)', href)
                    if um:
                        real_url = urllib.parse.unquote(um.group(1))
                if title:
                    results.append({"title": title, "url": real_url, "snippet": snippet, "source": "Web"})
                if len(results) >= max_results:
                    break
    except Exception:  # narrowed from bare except; best-effort helper
        pass
    return results
355
+
356
def self_crawl_search(query, max_results=3):
    """Crawl DuckDuckGo for *query*, adding a small academic follow-up
    search when the query does not already look scholarly."""
    combined = duckduckgo_search(query, max_results)
    looks_academic = '๋…ผ๋ฌธ' in query or 'paper' in query.lower()
    if not looks_academic:
        combined = combined + duckduckgo_search(f"{query} ๋…ผ๋ฌธ ํ•™์ˆ ", 2)
    return combined
363
+
364
def parallel_brave_search(queries, max_workers=10):
    """Run brave_search over many queries concurrently.

    Args:
        queries: iterable of query strings.
        max_workers: requested thread-pool size, capped at 20.

    Returns:
        Dict mapping each query to its (possibly empty) result list; a
        query whose lookup raised maps to [].
    """
    all_results = {}
    with ThreadPoolExecutor(max_workers=min(max_workers, 20)) as executor:
        futures = {executor.submit(brave_search, q, 3): q for q in queries}
        for future in as_completed(futures):
            q = futures[future]
            try:
                all_results[q] = future.result()
            # Narrowed from a bare `except:`; a failed query degrades to [].
            except Exception:
                all_results[q] = []
    return all_results