VietCat committed on
Commit
d01f520
·
1 Parent(s): 8f25a7a

init project

Browse files
Files changed (3) hide show
  1. app.py +8 -0
  2. rag_core/embedder.py +4 -4
  3. rag_core/retriever.py +36 -4
app.py CHANGED
@@ -27,6 +27,14 @@ async def ask_api(req: Request):
27
  answer = generate_answer(prompt)
28
  return {"answer": answer}
29
 
 
 
 
 
 
 
 
 
30
  # Gradio UI
31
  iface = gr.Interface(
32
  fn=lambda q: generate_answer("\n\n".join(retriever.query(q, get_embedding)) + f"\n\nCâu hỏi: {q}\nTrả lời:"),
 
27
  answer = generate_answer(prompt)
28
  return {"answer": answer}
29
 
30
@app.post("/rescan")
async def rescan_api():
    """Re-read the raw law text and index any chunks the retriever lacks.

    Reads ``data/raw_law.txt`` as UTF-8, splits it with ``chunk_legal_text``
    and passes the resulting chunks to ``retriever.rescan_and_append``
    together with the ``get_embedding`` function.

    Returns:
        A dict with a single ``"status"`` message.
    """
    with open("data/raw_law.txt", "r", encoding="utf-8") as law_file:
        raw_text = law_file.read()
    retriever.rescan_and_append(chunk_legal_text(raw_text), get_embedding)
    return {"status": "Rescan & update thành công."}
37
+
38
  # Gradio UI
39
  iface = gr.Interface(
40
  fn=lambda q: generate_answer("\n\n".join(retriever.query(q, get_embedding)) + f"\n\nCâu hỏi: {q}\nTrả lời:"),
rag_core/embedder.py CHANGED
@@ -10,13 +10,13 @@ def get_embedding(text: str, retries: int = 3):
10
  response = requests.post(
11
  "https://vietcat-phobertnode.hf.space/embed",
12
  json={"text": text},
13
- timeout=30 # Tăng từ 10 lên 30 giây
14
  )
15
- response.raise_for_status() # Nếu không 200 -> raise exception
16
  return response.json()["embedding"]
17
  except requests.exceptions.RequestException as e:
18
  logging.warning(f"Lỗi embedding (lần {i+1}/{retries}): {e}")
19
  if i < retries - 1:
20
- time.sleep(2) # Đợi 2s rồi thử lại
21
  else:
22
- raise RuntimeError(f"Không thể lấy embedding sau {retries} lần thử.")
 
10
  response = requests.post(
11
  "https://vietcat-phobertnode.hf.space/embed",
12
  json={"text": text},
13
+ timeout=30
14
  )
15
+ response.raise_for_status()
16
  return response.json()["embedding"]
17
  except requests.exceptions.RequestException as e:
18
  logging.warning(f"Lỗi embedding (lần {i+1}/{retries}): {e}")
19
  if i < retries - 1:
20
+ time.sleep(2)
21
  else:
22
+ raise
rag_core/retriever.py CHANGED
@@ -2,6 +2,7 @@ import faiss
2
  import numpy as np
3
  import os
4
  import pickle
 
5
  from rag_core.utils import log_timed
6
 
7
  INDEX_PATH = "faiss_index/index.faiss"
@@ -19,17 +20,48 @@ class Retriever:
19
 
20
  @log_timed("xây FAISS index")
21
  def build(self, texts: list, embed_fn):
22
- embeddings = [embed_fn(t) for t in texts]
 
 
 
 
 
 
 
 
 
 
23
  dim = len(embeddings[0])
24
  self.index = faiss.IndexFlatL2(dim)
25
  self.index.add(np.array(embeddings).astype("float32"))
26
  faiss.write_index(self.index, INDEX_PATH)
27
  with open(META_PATH, "wb") as f:
28
- pickle.dump(texts, f)
29
- self.texts = texts
30
 
31
  @log_timed("truy vấn FAISS")
32
  def query(self, query_text, embed_fn, k=3):
33
  q_emb = np.array([embed_fn(query_text)]).astype("float32")
34
  D, I = self.index.search(q_emb, k)
35
- return [self.texts[i] for i in I[0]]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import numpy as np
3
  import os
4
  import pickle
5
+ import logging
6
  from rag_core.utils import log_timed
7
 
8
  INDEX_PATH = "faiss_index/index.faiss"
 
20
 
21
  @log_timed("xây FAISS index")
22
  def build(self, texts: list, embed_fn):
23
+ embeddings = []
24
+ valid_texts = []
25
+ for i, t in enumerate(texts):
26
+ try:
27
+ emb = embed_fn(t)
28
+ embeddings.append(emb)
29
+ valid_texts.append(t)
30
+ except Exception as e:
31
+ logging.warning(f"❌ Lỗi embedding chunk {i}: {e}")
32
+ if not embeddings:
33
+ raise RuntimeError("Không có embedding nào thành công!")
34
  dim = len(embeddings[0])
35
  self.index = faiss.IndexFlatL2(dim)
36
  self.index.add(np.array(embeddings).astype("float32"))
37
  faiss.write_index(self.index, INDEX_PATH)
38
  with open(META_PATH, "wb") as f:
39
+ pickle.dump(valid_texts, f)
40
+ self.texts = valid_texts
41
 
42
  @log_timed("truy vấn FAISS")
43
  def query(self, query_text, embed_fn, k=3):
44
  q_emb = np.array([embed_fn(query_text)]).astype("float32")
45
  D, I = self.index.search(q_emb, k)
46
+ return [self.texts[i] for i in I[0]]
47
+
48
@log_timed("bổ sung embedding bị thiếu")
def rescan_and_append(self, full_texts, embed_fn):
    """Embed and append the chunks of ``full_texts`` not yet indexed.

    Chunks already in ``self.texts`` are skipped, as are repeated chunks
    inside ``full_texts`` itself. Chunks whose embedding call raises are
    logged and skipped. If at least one new chunk was embedded, the index
    and the text list are extended and both are written back to disk.

    Args:
        full_texts: All chunks of the current source text (must be
            hashable, since they are compared through a set).
        embed_fn: Callable mapping one chunk to its embedding vector.
    """
    seen = set(self.texts)
    new_texts = []
    for t in full_texts:
        # Track chunks as we go so a chunk repeated in full_texts is
        # embedded and indexed only once.
        if t not in seen:
            seen.add(t)
            new_texts.append(t)
    if not new_texts:
        logging.info("Không có chunk mới để thêm.")
        return

    new_embeddings = []
    added_texts = []
    for i, t in enumerate(new_texts):
        try:
            emb = embed_fn(t)
        except Exception as e:
            logging.warning(f"❌ Lỗi embedding chunk mới {i}: {e}")
            continue
        new_embeddings.append(emb)
        added_texts.append(t)

    if not new_embeddings:
        return

    # Add to the index first and only then extend self.texts, so a failing
    # add() cannot leave the text list longer than the index.
    self.index.add(np.array(new_embeddings).astype("float32"))
    self.texts.extend(added_texts)
    faiss.write_index(self.index, INDEX_PATH)
    with open(META_PATH, "wb") as f:
        pickle.dump(self.texts, f)