Nullpointer-KK commited on
Commit
8bd9348
·
1 Parent(s): 2fcbd3b
rag/__pycache__/ingest.cpython-313.pyc ADDED
Binary file (4.48 kB). View file
 
rag/__pycache__/pipeline.cpython-313.pyc ADDED
Binary file (6.35 kB). View file
 
rag/__pycache__/tools.cpython-313.pyc ADDED
Binary file (2.7 kB). View file
 
rag/__pycache__/utils.cpython-313.pyc ADDED
Binary file (8.56 kB). View file
 
rag/ingest.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ import uuid, pathlib, logging
3
+ from typing import List, Dict, Any
4
+ from pypdf import PdfReader
5
+ import trafilatura
6
+ from .utils import Doc, normalize_text
7
+
8
+ # Silence noisy pypdf warnings from malformed PDFs
9
+ logging.getLogger("pypdf").setLevel(logging.ERROR)
10
+
11
+ def read_txt(path: str) -> str:
12
+ with open(path, "r", encoding="utf-8", errors="ignore") as f:
13
+ return f.read()
14
+
15
+ def read_pdf(path: str) -> str:
16
+ text = []
17
+ reader = PdfReader(path)
18
+ for page in reader.pages:
19
+ text.append(page.extract_text() or "")
20
+ return "\n".join(text)
21
+
22
+ def read_any(path: str) -> str:
23
+ ext = pathlib.Path(path).suffix.lower()
24
+ if ext in [".txt", ".md"]:
25
+ return read_txt(path)
26
+ elif ext in [".pdf"]:
27
+ return read_pdf(path)
28
+ else:
29
+ return read_txt(path)
30
+
31
+ def fetch_url(url: str) -> str:
32
+ downloaded = trafilatura.fetch_url(url)
33
+ if not downloaded:
34
+ return ""
35
+ return trafilatura.extract(downloaded) or ""
36
+
37
+ def split_to_chunks(text: str, chunk_size: int = 800, overlap: int = 100) -> List[str]:
38
+ words = text.split()
39
+ if not words:
40
+ return []
41
+ chunks = []
42
+ i = 0
43
+ step = max(1, chunk_size - overlap)
44
+ while i < len(words):
45
+ chunk = " ".join(words[i:i+chunk_size])
46
+ chunks.append(chunk)
47
+ i += step
48
+ return chunks or [text]
49
+
50
+ def guess_coin(label: str) -> str:
51
+ low = label.lower()
52
+ if "bitcoin" in low or "btc" in low: return "bitcoin"
53
+ if "ethereum" in low or "eth" in low: return "ethereum"
54
+ return ""
55
+
56
+ def build_docs_from_paths(paths: List[str], source_label: str = "local") -> List[Doc]:
57
+ docs: List[Doc] = []
58
+ for p in paths or []:
59
+ raw = read_any(p)
60
+ if not raw:
61
+ continue
62
+ coin = guess_coin(p)
63
+ for i, chunk in enumerate(split_to_chunks(raw)):
64
+ docs.append(Doc(
65
+ id=f"{uuid.uuid4()}",
66
+ text=normalize_text(chunk),
67
+ metadata={"source": source_label, "path": p, "chunk": i, "coin": coin}
68
+ ))
69
+ return docs
70
+
71
+ def build_docs_from_urls(urls: List[str], source_label: str = "web") -> List[Doc]:
72
+ docs: List[Doc] = []
73
+ for u in urls or []:
74
+ raw = fetch_url(u)
75
+ if not raw:
76
+ continue
77
+ coin = guess_coin(u)
78
+ for i, chunk in enumerate(split_to_chunks(raw)):
79
+ docs.append(Doc(
80
+ id=f"{uuid.uuid4()}",
81
+ text=normalize_text(chunk),
82
+ metadata={"source": source_label, "url": u, "chunk": i, "coin": coin}
83
+ ))
84
+ return docs
rag/pipeline.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ from typing import List, Dict, Any
3
+
4
+ from openai import OpenAI
5
+ from .utils import HybridIndex, Reranker, Doc, select_fewshots
6
+ from .ingest import build_docs_from_paths, build_docs_from_urls
7
+ from prompts import SYSTEM_PROMPT, FEWSHOTS
8
+
9
+ class CryptoRAGPipeline:
10
+ def __init__(self, dense_model: str = "sentence-transformers/all-MiniLM-L6-v2", reranker_model: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"):
11
+ self.index = HybridIndex(dense_model_name=dense_model)
12
+ self.reranker = Reranker(reranker_model)
13
+ self.client: OpenAI | None = None
14
+
15
+ def set_openai(self, api_key: str):
16
+ self.client = OpenAI(api_key=api_key)
17
+
18
+ def add_local_files(self, paths: List[str]):
19
+ docs = build_docs_from_paths(paths, source_label="local")
20
+ self.index.add(docs)
21
+
22
+ def add_urls(self, urls: List[str]):
23
+ docs = build_docs_from_urls(urls, source_label="web")
24
+ self.index.add(docs)
25
+
26
+ def build(self):
27
+ self.index.build()
28
+
29
+ def route(self, query: str) -> str:
30
+ q = query.lower()
31
+ if any(k in q for k in ["price", "market cap", "marketcap", "ath", "all-time high", "24h", "fear greed", "greed index"]):
32
+ return "tools"
33
+ return "rag"
34
+
35
+ def build_prompt(self, query: str, contexts: List[Doc]) -> str:
36
+ fs = select_fewshots(query, FEWSHOTS, self.index.embedder, n=2)
37
+ few = "\n\n".join([f"Q: {x['q']}\nA: {x['a']}" for x in fs])
38
+ ctx = "\n\n".join([f"[{i+1}] {c.text[:1200]}" for i, c in enumerate(contexts)])
39
+ prompt = f"""{SYSTEM_PROMPT}
40
+
41
+ Few-shot examples:
42
+ {few}
43
+
44
+ Context (use to answer if relevant; cite [#]):
45
+ {ctx}
46
+
47
+ User question: {query}
48
+
49
+ Answer:"""
50
+ return prompt
51
+
52
+ def answer_stream(self, query: str, contexts: List[Doc], model: str = "gpt-4o-mini"):
53
+ assert self.client is not None, "LLM client not set"
54
+ prompt = self.build_prompt(query, contexts)
55
+ with self.client.chat.completions.create(
56
+ model=model,
57
+ messages=[{"role":"system","content":SYSTEM_PROMPT},
58
+ {"role":"user","content":prompt}],
59
+ stream=True,
60
+ temperature=0.3,
61
+ max_tokens=400
62
+ ) as stream:
63
+ for event in stream:
64
+ if hasattr(event, "choices") and event.choices:
65
+ delta = event.choices[0].delta
66
+ if delta and delta.content:
67
+ yield delta.content
68
+
69
+ def ask(self, query: str, k: int = 8, alpha: float = 0.5, top_k_rerank: int = 5, filters: Dict[str, Any] | None = None, stream: bool = True):
70
+ route = self.route(query)
71
+ if route == "tools":
72
+ return {"route": "tools", "contexts": []}
73
+
74
+ # Try auto-build if needed
75
+ if not self.index.ready():
76
+ self.index.build()
77
+ if not self.index.ready():
78
+ return {"route": "not_ready", "contexts": [], "reason": "index_empty" if len(self.index.docs)==0 else "build_failed"}
79
+
80
+ hits = self.index.search(query, k=k, alpha=alpha, filters=filters)
81
+ if not hits:
82
+ return {"route": "not_ready", "contexts": [], "reason": "no_results"}
83
+
84
+ reranked = self.reranker.rerank(query, hits, top_k=top_k_rerank)
85
+ top_contexts = [d for d,_ in reranked]
86
+ return {"route": "rag", "contexts": top_contexts}
rag/tools.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ import requests
3
+
4
+ # Minimal map from common names/symbols → CoinGecko IDs
5
+ COIN_MAP = {
6
+ "btc": "bitcoin", "bitcoin": "bitcoin",
7
+ "eth": "ethereum", "ethereum": "ethereum",
8
+ "sol": "solana", "solana": "solana",
9
+ "xrp": "ripple", "ripple": "ripple",
10
+ }
11
+
12
+ def resolve_coin_id(text_or_symbol: str, default: str = "bitcoin") -> str:
13
+ t = (text_or_symbol or "").lower().strip()
14
+ # try exact-in-text matches first (longest keys first)
15
+ for key in sorted(COIN_MAP.keys(), key=len, reverse=True):
16
+ if key in t.split() or key in t:
17
+ return COIN_MAP[key]
18
+ return default
19
+
20
+ def get_price(coin_id: str = "bitcoin", vs: str = "usd"):
21
+ url = f"https://api.coingecko.com/api/v3/simple/price?ids={coin_id}&vs_currencies={vs}"
22
+ r = requests.get(url, timeout=10)
23
+ r.raise_for_status()
24
+ data = r.json()
25
+ return data.get(coin_id, {}).get(vs)
26
+
27
+ def get_price_any(coin_or_query: str, vs: str = "usd"):
28
+ coin_id = resolve_coin_id(coin_or_query)
29
+ return coin_id, get_price(coin_id, vs)
30
+
31
+ def get_price_multi(coin_ids: list[str], vs: str = "usd") -> dict:
32
+ # Efficient batch call (one request) e.g. ["bitcoin","ethereum","solana","ripple"]
33
+ unique = ",".join(sorted(set(coin_ids)))
34
+ url = f"https://api.coingecko.com/api/v3/simple/price?ids={unique}&vs_currencies={vs}"
35
+ r = requests.get(url, timeout=10)
36
+ r.raise_for_status()
37
+ return r.json()
38
+
39
+ def get_fear_greed():
40
+ url = "https://api.alternative.me/fng/"
41
+ r = requests.get(url, timeout=10)
42
+ r.raise_for_status()
43
+ data = r.json()
44
+ if data.get("data"):
45
+ return data["data"][0]
46
+ return None
rag/utils.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ import re, json, hashlib
3
+ from dataclasses import dataclass
4
+ from typing import List, Dict, Any, Tuple
5
+
6
+ import numpy as np
7
+ from rank_bm25 import BM25Okapi
8
+ from sentence_transformers import SentenceTransformer, CrossEncoder
9
+
10
+ def cache_key(obj: Any) -> str:
11
+ return hashlib.sha256(json.dumps(obj, sort_keys=True).encode()).hexdigest()
12
+
13
+ def normalize_text(s: str) -> str:
14
+ return re.sub(r"\s+", " ", s).strip()
15
+
16
+ @dataclass
17
+ class Doc:
18
+ id: str
19
+ text: str
20
+ metadata: Dict[str, Any]
21
+
22
+ class HybridIndex:
23
+ def __init__(self, dense_model_name: str = "sentence-transformers/all-MiniLM-L6-v2"):
24
+ self.dense_model_name = dense_model_name
25
+ self.embedder = SentenceTransformer(dense_model_name)
26
+ self.docs: List[Doc] = []
27
+ self.bm25 = None
28
+ self.embeddings = None
29
+
30
+ def add(self, docs: List[Doc]):
31
+ self.docs.extend(docs)
32
+
33
+ def build(self):
34
+ # Build only if we have docs
35
+ if not self.docs:
36
+ self.bm25, self.embeddings = None, None
37
+ return
38
+ corpus = [d.text for d in self.docs]
39
+ tokenized = [c.split() for c in corpus]
40
+ self.bm25 = BM25Okapi(tokenized)
41
+ self.embeddings = self.embedder.encode(
42
+ corpus, convert_to_numpy=True, normalize_embeddings=True
43
+ )
44
+
45
+ def ready(self) -> bool:
46
+ return (self.bm25 is not None) and (self.embeddings is not None) and (len(self.docs) > 0)
47
+
48
+ def search(self, query: str, k: int = 8, alpha: float = 0.5, filters: Dict[str, Any] | None = None):
49
+ # If index isn't ready, return empty (UI/pipeline should guide the user)
50
+ if not self.ready():
51
+ return []
52
+
53
+ # Dense embedding for query
54
+ query_vec = self.embedder.encode([query], convert_to_numpy=True, normalize_embeddings=True)[0]
55
+
56
+ # BM25 + dense scores
57
+ q_tokens = query.split()
58
+ try:
59
+ bm25_scores = self.bm25.get_scores(q_tokens)
60
+ except Exception:
61
+ # Fallback if BM25 hiccups (e.g., empty tokens)
62
+ bm25_scores = np.zeros(len(self.docs), dtype=float)
63
+ dense_scores = (self.embeddings @ query_vec)
64
+
65
+ # NumPy 2.x-safe normalization
66
+ def _norm(x: np.ndarray) -> np.ndarray:
67
+ x = np.asarray(x, dtype=float)
68
+ rng = np.ptp(x)
69
+ return (x - x.min()) / (rng + 1e-8)
70
+
71
+ bm25_norm = _norm(bm25_scores)
72
+ dense_norm = _norm(dense_scores)
73
+ scores = alpha * bm25_norm + (1 - alpha) * dense_norm
74
+
75
+ # Optional metadata filters
76
+ idxs = np.arange(len(self.docs))
77
+ if filters:
78
+ def ok(d: Doc) -> bool:
79
+ for kf, vf in filters.items():
80
+ if kf not in d.metadata:
81
+ return False
82
+ dv = str(d.metadata[kf]).lower()
83
+ if isinstance(vf, (list, tuple, set)):
84
+ if not any(str(x).lower() in dv for x in vf):
85
+ return False
86
+ else:
87
+ if str(vf).lower() not in dv:
88
+ return False
89
+ return True
90
+
91
+ keep = [i for i in idxs if ok(self.docs[int(i)])]
92
+ if not keep:
93
+ return []
94
+ idxs = np.array(keep, dtype=int)
95
+ scores = scores[idxs]
96
+
97
+ # Top-k results
98
+ order = np.argsort(-scores)[:k]
99
+ return [(self.docs[int(idxs[i])], float(scores[i])) for i in order]
100
+
101
+ class Reranker:
102
+ def __init__(self, model_name: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"):
103
+ self.model = CrossEncoder(model_name)
104
+
105
+ def rerank(self, query: str, docs: List[Tuple[Doc, float]], top_k: int = 5) -> List[Tuple[Doc, float]]:
106
+ if not docs:
107
+ return []
108
+ pairs = [(query, d.text) for d, _ in docs]
109
+ scores = self.model.predict(pairs)
110
+ rescored = list(zip([d for d,_ in docs], [float(s) for s in scores]))
111
+ rescored.sort(key=lambda x: -x[1])
112
+ return rescored[:top_k]
113
+
114
+ def select_fewshots(query: str, fewshots: List[Dict[str, str]], embedder: SentenceTransformer, n: int = 2):
115
+ if not fewshots:
116
+ return []
117
+ qv = embedder.encode([query], convert_to_numpy=True, normalize_embeddings=True)[0]
118
+ ex_vecs = embedder.encode([fs["q"] for fs in fewshots], convert_to_numpy=True, normalize_embeddings=True)
119
+ sims = ex_vecs @ qv
120
+ order = np.argsort(-sims)[:n]
121
+ return [fewshots[i] for i in order]