"""Document ingestion: read local files and URLs, split text into chunks, and build Doc records."""
from __future__ import annotations

import logging
import pathlib
import re
import uuid
from typing import Any, Dict, List

import trafilatura
from pypdf import PdfReader

from .utils import Doc, normalize_text
# Silence noisy pypdf warnings emitted while parsing malformed PDFs;
# only true errors from the library are surfaced.
logging.getLogger("pypdf").setLevel(logging.ERROR)
def read_txt(path: str) -> str:
    """Return the contents of a UTF-8 text file, skipping undecodable bytes."""
    return pathlib.Path(path).read_text(encoding="utf-8", errors="ignore")
def read_pdf(path: str) -> str:
    """Extract text from every page of a PDF, pages joined by newlines.

    Pages that yield no text (extract_text() returns None) contribute an
    empty string so page count and separator positions are preserved.
    """
    reader = PdfReader(path)
    page_texts = (page.extract_text() or "" for page in reader.pages)
    return "\n".join(page_texts)
def read_any(path: str) -> str:
    """Dispatch to the appropriate reader by file extension.

    PDFs go through read_pdf; everything else (including .txt and .md)
    is read as plain text — exactly the original routing, since the
    text branch and the fallback branch were identical.
    """
    suffix = pathlib.Path(path).suffix.lower()
    return read_pdf(path) if suffix == ".pdf" else read_txt(path)
def fetch_url(url: str) -> str:
    """Download a URL and return its main extracted text.

    Returns "" when the download fails or extraction yields nothing,
    so callers can simply skip empty results.
    """
    html = trafilatura.fetch_url(url)
    if html:
        return trafilatura.extract(html) or ""
    return ""
def split_to_chunks(text: str, chunk_size: int = 800, overlap: int = 100) -> List[str]:
    """Split *text* into overlapping word-window chunks.

    Each chunk holds up to ``chunk_size`` whitespace-separated words; the
    window advances by ``chunk_size - overlap`` words (minimum 1, so an
    overlap >= chunk_size cannot cause an infinite loop).

    Args:
        text: Raw text to split; internal whitespace is normalized to
            single spaces by the word split/join.
        chunk_size: Maximum words per chunk.
        overlap: Words shared between consecutive chunks.

    Returns:
        List of chunk strings; empty list for empty/whitespace-only text.
    """
    words = text.split()
    if not words:
        return []
    step = max(1, chunk_size - overlap)
    # The original ended with `return chunks or [text]`; the `or [text]`
    # fallback was unreachable (non-empty `words` always yields a chunk),
    # so it is dropped here.
    return [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), step)]
def guess_coin(label: str) -> str:
    """Heuristically identify which coin a path/URL/label refers to.

    Matches coin names and tickers as whole tokens (delimited by anything
    that is not a letter or digit, so ``_``, ``-``, ``/`` and ``.`` all
    count as separators). The original used plain substring checks, which
    produced false positives — e.g. the "eth" inside "weather" or
    "method" classified unrelated files as ethereum.

    Args:
        label: File path, URL, or free-form label (any case).

    Returns:
        "bitcoin", "ethereum", or "" when neither is recognized.
    """
    low = label.lower()
    if re.search(r"(?<![a-z0-9])(?:bitcoin|btc)(?![a-z0-9])", low):
        return "bitcoin"
    if re.search(r"(?<![a-z0-9])(?:ethereum|eth)(?![a-z0-9])", low):
        return "ethereum"
    return ""
def build_docs_from_paths(paths: List[str], source_label: str = "local") -> List[Doc]:
    """Turn local files into chunked Doc records with provenance metadata.

    Files that read as empty are skipped. Each chunk becomes one Doc with
    a fresh UUID, normalized text, and metadata recording the source
    label, file path, chunk index, and guessed coin.
    """
    out: List[Doc] = []
    for path in paths or []:
        content = read_any(path)
        if not content:
            continue
        coin = guess_coin(path)
        for idx, piece in enumerate(split_to_chunks(content)):
            meta = {"source": source_label, "path": path, "chunk": idx, "coin": coin}
            out.append(Doc(id=str(uuid.uuid4()), text=normalize_text(piece), metadata=meta))
    return out
def build_docs_from_urls(urls: List[str], source_label: str = "web") -> List[Doc]:
    """Turn fetched web pages into chunked Doc records with provenance metadata.

    URLs whose download or extraction yields nothing are skipped. Each
    chunk becomes one Doc with a fresh UUID, normalized text, and
    metadata recording the source label, URL, chunk index, and guessed coin.
    """
    out: List[Doc] = []
    for url in urls or []:
        content = fetch_url(url)
        if not content:
            continue
        coin = guess_coin(url)
        for idx, piece in enumerate(split_to_chunks(content)):
            meta = {"source": source_label, "url": url, "chunk": idx, "coin": coin}
            out.append(Doc(id=str(uuid.uuid4()), text=normalize_text(piece), metadata=meta))
    return out
|