File size: 2,589 Bytes
8bd9348
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
from __future__ import annotations
import uuid, pathlib, logging
from typing import List, Dict, Any
from pypdf import PdfReader
import trafilatura
from .utils import Doc, normalize_text

# Raise the "pypdf" logger's threshold to ERROR as soon as this module is
# imported, so the noisy warnings pypdf emits for malformed PDFs are suppressed
# while genuine errors are still logged.
logging.getLogger("pypdf").setLevel(logging.ERROR)

def read_txt(path: str) -> str:
    """Return the full contents of the file at ``path`` as text.

    The file is decoded as UTF-8; byte sequences that are not valid UTF-8 are
    dropped rather than raising a decoding error.
    """
    source = pathlib.Path(path)
    return source.read_text(encoding="utf-8", errors="ignore")

def read_pdf(path: str) -> str:
    """Return the extracted text of every page of the PDF at ``path``.

    Pages are joined with a newline. A page for which ``extract_text()``
    returns a falsy value contributes an empty string, so the number of
    newline separators always matches the page count minus one.
    """
    pages = PdfReader(path).pages
    return "\n".join((page.extract_text() or "") for page in pages)

def read_any(path: str) -> str:
    """Read ``path`` with the reader that matches its file extension.

    The extension is compared case-insensitively. ``.pdf`` files go through
    ``read_pdf``; ``.txt``, ``.md`` and every other (or missing) extension are
    read as plain text via ``read_txt``.
    """
    suffix = pathlib.Path(path).suffix.lower()
    # Only PDFs need a dedicated parser; everything else is treated as text.
    reader = read_pdf if suffix == ".pdf" else read_txt
    return reader(path)

def fetch_url(url: str) -> str:
    """Fetch ``url`` with trafilatura and return the text it extracts.

    Returns an empty string when ``trafilatura.fetch_url`` yields nothing or
    when ``trafilatura.extract`` produces no text for the fetched content.
    """
    payload = trafilatura.fetch_url(url)
    if payload:
        extracted = trafilatura.extract(payload)
        return extracted or ""
    return ""

def split_to_chunks(text: str, chunk_size: int = 800, overlap: int = 100) -> List[str]:
    """Split ``text`` into overlapping chunks of whitespace-separated words.

    Args:
        text: Input text; it is tokenised with ``str.split()``, so any run of
            whitespace counts as a single separator.
        chunk_size: Maximum number of words per chunk. Must be at least 1.
        overlap: Number of words shared between consecutive chunks. Negative
            values are treated as 0. If it is greater than or equal to
            ``chunk_size`` the window still advances by one word per chunk.

    Returns:
        The chunks in order, each with its words re-joined by single spaces.
        An empty or whitespace-only ``text`` yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    words = text.split()
    if not words:
        return []

    # A negative overlap would make the step larger than the window and
    # silently drop words between chunks, so clamp it to zero.
    step = max(1, chunk_size - max(0, overlap))

    chunks: List[str] = []
    for start in range(0, len(words), step):
        end = start + chunk_size
        chunks.append(" ".join(words[start:end]))
        # Once a window reaches the end of the text, any further window would
        # contain only overlap words already present in this chunk.
        if end >= len(words):
            break
    return chunks

def guess_coin(label: str) -> str:
    """Guess which coin a path or URL refers to from its text.

    The full names ``bitcoin`` and ``ethereum`` are matched anywhere in the
    lower-cased label. The short tickers ``btc`` and ``eth`` only count when
    they stand alone, i.e. are not directly adjacent to another ASCII letter
    or digit, so labels such as ``methods.txt`` or ``together`` are no longer
    misclassified as ethereum.

    Args:
        label: A file path, URL or other free-form label.

    Returns:
        ``"bitcoin"``, ``"ethereum"``, or ``""`` when neither is recognised.
        Bitcoin takes precedence when both are mentioned.
    """
    import re  # function-scope import keeps this block self-contained

    low = label.lower()

    def has_ticker(ticker: str) -> bool:
        # Underscores, dots, slashes and dashes all count as token boundaries.
        pattern = rf"(?<![a-z0-9]){re.escape(ticker)}(?![a-z0-9])"
        return re.search(pattern, low) is not None

    if "bitcoin" in low or has_ticker("btc"):
        return "bitcoin"
    if "ethereum" in low or has_ticker("eth"):
        return "ethereum"
    return ""

def build_docs_from_paths(paths: List[str], source_label: str = "local") -> List[Doc]:
    """Turn local files into a flat list of chunk-level ``Doc`` objects.

    Each path is read with ``read_any``; files that yield no text are skipped.
    The text is split with ``split_to_chunks`` (default settings) and every
    chunk becomes one ``Doc`` with a fresh UUID4 string as ``id``, the chunk
    passed through ``normalize_text`` as ``text``, and metadata holding the
    ``source`` label, the originating ``path``, the ``chunk`` index within
    that file and the ``coin`` guessed from the path.

    A falsy ``paths`` (``None`` or empty) produces an empty list.
    """
    collected: List[Doc] = []
    if not paths:
        return collected

    for path in paths:
        content = read_any(path)
        if content:
            coin_tag = guess_coin(path)
            collected.extend(
                Doc(
                    id=str(uuid.uuid4()),
                    text=normalize_text(piece),
                    metadata={"source": source_label, "path": path, "chunk": idx, "coin": coin_tag},
                )
                for idx, piece in enumerate(split_to_chunks(content))
            )
    return collected

def build_docs_from_urls(urls: List[str], source_label: str = "web") -> List[Doc]:
    """Turn web pages into a flat list of chunk-level ``Doc`` objects.

    Each URL is fetched with ``fetch_url``; URLs that yield no text are
    skipped. The text is split with ``split_to_chunks`` (default settings) and
    every chunk becomes one ``Doc`` with a fresh UUID4 string as ``id``, the
    chunk passed through ``normalize_text`` as ``text``, and metadata holding
    the ``source`` label, the originating ``url``, the ``chunk`` index within
    that page and the ``coin`` guessed from the URL.

    A falsy ``urls`` (``None`` or empty) produces an empty list.
    """
    collected: List[Doc] = []
    if not urls:
        return collected

    for link in urls:
        content = fetch_url(link)
        if content:
            coin_tag = guess_coin(link)
            collected.extend(
                Doc(
                    id=str(uuid.uuid4()),
                    text=normalize_text(piece),
                    metadata={"source": source_label, "url": link, "chunk": idx, "coin": coin_tag},
                )
                for idx, piece in enumerate(split_to_chunks(content))
            )
    return collected