init project
Browse files- app.py +8 -0
- rag_core/embedder.py +4 -4
- rag_core/retriever.py +36 -4
app.py
CHANGED
|
@@ -27,6 +27,14 @@ async def ask_api(req: Request):
|
|
| 27 |
answer = generate_answer(prompt)
|
| 28 |
return {"answer": answer}
|
| 29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
# Gradio UI
|
| 31 |
iface = gr.Interface(
|
| 32 |
fn=lambda q: generate_answer("\n\n".join(retriever.query(q, get_embedding)) + f"\n\nCâu hỏi: {q}\nTrả lời:"),
|
|
|
|
| 27 |
answer = generate_answer(prompt)
|
| 28 |
return {"answer": answer}
|
| 29 |
|
| 30 |
+
@app.post("/rescan")
async def rescan_api():
    """Re-read the raw law corpus and append any not-yet-indexed chunks.

    Reads the full source text from disk, re-chunks it, and hands the
    chunks to the retriever, which deduplicates against what is already
    indexed before embedding anything new.
    """
    with open("data/raw_law.txt", "r", encoding="utf-8") as f:
        text = f.read()
    # rescan_and_append only embeds chunks it has not seen before.
    retriever.rescan_and_append(chunk_legal_text(text), get_embedding)
    return {"status": "Rescan & update thành công."}
|
| 37 |
+
|
| 38 |
# Gradio UI
|
| 39 |
iface = gr.Interface(
|
| 40 |
fn=lambda q: generate_answer("\n\n".join(retriever.query(q, get_embedding)) + f"\n\nCâu hỏi: {q}\nTrả lời:"),
|
rag_core/embedder.py
CHANGED
|
@@ -10,13 +10,13 @@ def get_embedding(text: str, retries: int = 3):
|
|
| 10 |
response = requests.post(
|
| 11 |
"https://vietcat-phobertnode.hf.space/embed",
|
| 12 |
json={"text": text},
|
| 13 |
-
timeout=30
|
| 14 |
)
|
| 15 |
-
response.raise_for_status()
|
| 16 |
return response.json()["embedding"]
|
| 17 |
except requests.exceptions.RequestException as e:
|
| 18 |
logging.warning(f"Lỗi embedding (lần {i+1}/{retries}): {e}")
|
| 19 |
if i < retries - 1:
|
| 20 |
-
time.sleep(2)
|
| 21 |
else:
|
| 22 |
-
raise
|
|
|
|
| 10 |
response = requests.post(
|
| 11 |
"https://vietcat-phobertnode.hf.space/embed",
|
| 12 |
json={"text": text},
|
| 13 |
+
timeout=30
|
| 14 |
)
|
| 15 |
+
response.raise_for_status()
|
| 16 |
return response.json()["embedding"]
|
| 17 |
except requests.exceptions.RequestException as e:
|
| 18 |
logging.warning(f"Lỗi embedding (lần {i+1}/{retries}): {e}")
|
| 19 |
if i < retries - 1:
|
| 20 |
+
time.sleep(2)
|
| 21 |
else:
|
| 22 |
+
raise
|
rag_core/retriever.py
CHANGED
|
@@ -2,6 +2,7 @@ import faiss
|
|
| 2 |
import numpy as np
|
| 3 |
import os
|
| 4 |
import pickle
|
|
|
|
| 5 |
from rag_core.utils import log_timed
|
| 6 |
|
| 7 |
INDEX_PATH = "faiss_index/index.faiss"
|
|
@@ -19,17 +20,48 @@ class Retriever:
|
|
| 19 |
|
| 20 |
@log_timed("xây FAISS index")
|
| 21 |
def build(self, texts: list, embed_fn):
|
| 22 |
-
embeddings = [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
dim = len(embeddings[0])
|
| 24 |
self.index = faiss.IndexFlatL2(dim)
|
| 25 |
self.index.add(np.array(embeddings).astype("float32"))
|
| 26 |
faiss.write_index(self.index, INDEX_PATH)
|
| 27 |
with open(META_PATH, "wb") as f:
|
| 28 |
-
pickle.dump(
|
| 29 |
-
self.texts =
|
| 30 |
|
| 31 |
@log_timed("truy vấn FAISS")
|
| 32 |
def query(self, query_text, embed_fn, k=3):
|
| 33 |
q_emb = np.array([embed_fn(query_text)]).astype("float32")
|
| 34 |
D, I = self.index.search(q_emb, k)
|
| 35 |
-
return [self.texts[i] for i in I[0]]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
import numpy as np
|
| 3 |
import os
|
| 4 |
import pickle
|
| 5 |
+
import logging
|
| 6 |
from rag_core.utils import log_timed
|
| 7 |
|
| 8 |
INDEX_PATH = "faiss_index/index.faiss"
|
|
|
|
| 20 |
|
| 21 |
@log_timed("xây FAISS index")
|
| 22 |
def build(self, texts: list, embed_fn):
|
| 23 |
+
embeddings = []
|
| 24 |
+
valid_texts = []
|
| 25 |
+
for i, t in enumerate(texts):
|
| 26 |
+
try:
|
| 27 |
+
emb = embed_fn(t)
|
| 28 |
+
embeddings.append(emb)
|
| 29 |
+
valid_texts.append(t)
|
| 30 |
+
except Exception as e:
|
| 31 |
+
logging.warning(f"❌ Lỗi embedding chunk {i}: {e}")
|
| 32 |
+
if not embeddings:
|
| 33 |
+
raise RuntimeError("Không có embedding nào thành công!")
|
| 34 |
dim = len(embeddings[0])
|
| 35 |
self.index = faiss.IndexFlatL2(dim)
|
| 36 |
self.index.add(np.array(embeddings).astype("float32"))
|
| 37 |
faiss.write_index(self.index, INDEX_PATH)
|
| 38 |
with open(META_PATH, "wb") as f:
|
| 39 |
+
pickle.dump(valid_texts, f)
|
| 40 |
+
self.texts = valid_texts
|
| 41 |
|
| 42 |
@log_timed("truy vấn FAISS")
|
| 43 |
def query(self, query_text, embed_fn, k=3):
|
| 44 |
q_emb = np.array([embed_fn(query_text)]).astype("float32")
|
| 45 |
D, I = self.index.search(q_emb, k)
|
| 46 |
+
return [self.texts[i] for i in I[0]]
|
| 47 |
+
|
| 48 |
+
@log_timed("bổ sung embedding bị thiếu")
|
| 49 |
+
def rescan_and_append(self, full_texts, embed_fn):
|
| 50 |
+
existing_set = set(self.texts)
|
| 51 |
+
new_texts = [t for t in full_texts if t not in existing_set]
|
| 52 |
+
if not new_texts:
|
| 53 |
+
logging.info("Không có chunk mới để thêm.")
|
| 54 |
+
return
|
| 55 |
+
new_embeddings = []
|
| 56 |
+
for i, t in enumerate(new_texts):
|
| 57 |
+
try:
|
| 58 |
+
emb = embed_fn(t)
|
| 59 |
+
new_embeddings.append(emb)
|
| 60 |
+
self.texts.append(t)
|
| 61 |
+
except Exception as e:
|
| 62 |
+
logging.warning(f"❌ Lỗi embedding chunk mới {i}: {e}")
|
| 63 |
+
if new_embeddings:
|
| 64 |
+
self.index.add(np.array(new_embeddings).astype("float32"))
|
| 65 |
+
faiss.write_index(self.index, INDEX_PATH)
|
| 66 |
+
with open(META_PATH, "wb") as f:
|
| 67 |
+
pickle.dump(self.texts, f)
|