# Document ingestion pipeline: read local files (txt/md/pdf) and URLs,
# split extracted text into overlapping word chunks, and wrap them as Doc objects.
from __future__ import annotations

import logging
import pathlib
import uuid
from typing import Any, Dict, List

import trafilatura
from pypdf import PdfReader

from .utils import Doc, normalize_text

# Silence noisy pypdf warnings from malformed PDFs
logging.getLogger("pypdf").setLevel(logging.ERROR)
def read_txt(path: str) -> str:
    """Return the contents of a UTF-8 text file, skipping undecodable bytes."""
    with open(path, "r", encoding="utf-8", errors="ignore") as handle:
        return handle.read()
def read_pdf(path: str) -> str:
    """Extract text from every page of a PDF, joined by newlines.

    Pages where pypdf cannot extract text (extract_text() returns None)
    contribute an empty string, so page count is preserved in the output.
    """
    reader = PdfReader(path)
    pages = [page.extract_text() or "" for page in reader.pages]
    return "\n".join(pages)
def read_any(path: str) -> str:
    """Dispatch to the appropriate reader based on the file extension."""
    suffix = pathlib.Path(path).suffix.lower()
    if suffix == ".pdf":
        return read_pdf(path)
    # .txt, .md, and any unknown extension are treated as plain text.
    return read_txt(path)
def fetch_url(url: str) -> str:
    """Download a web page and return its main extracted text ("" on any failure)."""
    downloaded = trafilatura.fetch_url(url)
    # Both the fetch and the extraction can fail; normalize every failure to "".
    return (trafilatura.extract(downloaded) or "") if downloaded else ""
def split_to_chunks(text: str, chunk_size: int = 800, overlap: int = 100) -> List[str]:
    """Split *text* into overlapping chunks of at most *chunk_size* words.

    The text is tokenized on whitespace; consecutive chunks share *overlap*
    words. If overlap >= chunk_size the window still advances by one word
    per step so the loop always terminates.

    Args:
        text: Input text; an empty/whitespace-only string yields [].
        chunk_size: Maximum number of words per chunk (must be > 0).
        overlap: Number of words shared between adjacent chunks (must be >= 0).

    Returns:
        List of space-joined word chunks (empty list for empty input).

    Raises:
        ValueError: If chunk_size is not positive or overlap is negative.
    """
    # Fail fast on nonsensical parameters: chunk_size <= 0 previously
    # produced one empty-string chunk per word instead of an error.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if overlap < 0:
        raise ValueError("overlap must be non-negative")
    words = text.split()
    if not words:
        return []
    # Advance at least one word per step even when overlap >= chunk_size.
    step = max(1, chunk_size - overlap)
    # Since words is non-empty this always yields at least one chunk, so the
    # old `chunks or [text]` fallback was unreachable and has been removed.
    return [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), step)]
def guess_coin(label: str) -> str:
    """Best-effort coin identification from a substring of a path/URL/label.

    Checks bitcoin markers before ethereum markers; returns "" when neither
    matches. NOTE: plain substring matching, so e.g. "method" contains "eth".
    """
    haystack = label.lower()
    markers = (
        ("bitcoin", ("bitcoin", "btc")),
        ("ethereum", ("ethereum", "eth")),
    )
    for coin, needles in markers:
        if any(needle in haystack for needle in needles):
            return coin
    return ""
def build_docs_from_paths(paths: List[str], source_label: str = "local") -> List[Doc]:
    """Read each local file, chunk its text, and wrap every chunk in a Doc.

    Unreadable or empty files are skipped. Each Doc gets a random UUID id,
    normalized chunk text, and metadata recording source/path/chunk index/coin.
    """
    result: List[Doc] = []
    for path in paths or []:
        content = read_any(path)
        if not content:
            continue  # skip files that produced no text
        coin = guess_coin(path)
        for index, piece in enumerate(split_to_chunks(content)):
            meta = {"source": source_label, "path": path, "chunk": index, "coin": coin}
            result.append(
                Doc(id=str(uuid.uuid4()), text=normalize_text(piece), metadata=meta)
            )
    return result
def build_docs_from_urls(urls: List[str], source_label: str = "web") -> List[Doc]:
    """Fetch each URL, chunk the extracted text, and wrap every chunk in a Doc.

    URLs whose fetch/extraction yields no text are skipped. Each Doc gets a
    random UUID id, normalized chunk text, and metadata recording
    source/url/chunk index/coin.
    """
    result: List[Doc] = []
    for url in urls or []:
        content = fetch_url(url)
        if not content:
            continue  # skip URLs that produced no text
        coin = guess_coin(url)
        for index, piece in enumerate(split_to_chunks(content)):
            meta = {"source": source_label, "url": url, "chunk": index, "coin": coin}
            result.append(
                Doc(id=str(uuid.uuid4()), text=normalize_text(piece), metadata=meta)
            )
    return result