Ryanfafa committed on
Commit
746cdfd
·
verified ·
1 Parent(s): 9efdf3f

Update rag_engine.py

Browse files
Files changed (1) hide show
  1. rag_engine.py +39 -53
rag_engine.py CHANGED
@@ -1,18 +1,16 @@
1
  """
2
  RAG Engine
3
  Embeddings : sentence-transformers/all-MiniLM-L6-v2
4
- Vector DB : ChromaDB (local, in-memory / persistent)
5
- LLM : HuggingFace Router API (tries multiple free models)
6
  Chunking : Recursive character splitter with overlap
7
  """
8
 
9
  import os
10
  import re
11
- import requests
12
  import tempfile
13
  from typing import Tuple, List
14
 
15
- import chromadb
16
  from chromadb.config import Settings
17
  from langchain.text_splitter import RecursiveCharacterTextSplitter
18
  from langchain_community.embeddings import HuggingFaceEmbeddings
@@ -27,12 +25,12 @@ TOP_K = 4
27
  COLLECTION_NAME = "docmind_collection"
28
  CHROMA_DIR = "./chroma_db"
29
 
30
- # Free models to try in order
31
  CANDIDATE_MODELS = [
32
  "mistralai/Mistral-7B-Instruct-v0.3",
33
  "microsoft/Phi-3.5-mini-instruct",
34
- "google/gemma-2-2b-it",
35
  "HuggingFaceH4/zephyr-7b-beta",
 
36
  ]
37
 
38
 
@@ -106,66 +104,53 @@ class RAGEngine:
106
  if not hf_token:
107
  excerpt = extract_best(question, context)
108
  return (
109
- "HF_TOKEN not set. To enable AI answers:\n"
110
- "1. Get a free token at huggingface.co/settings/tokens\n"
111
- "2. Add it as a Secret named HF_TOKEN in your Space Settings\n\n"
112
- "Most relevant excerpt from your document:\n\n" + excerpt
113
  )
114
 
115
- headers = {
116
- "Content-Type": "application/json",
117
- "Authorization": "Bearer " + hf_token,
118
- }
119
- messages = [
120
- {
121
- "role": "system",
122
- "content": (
123
- "You are DocMind, an expert document analyst. "
124
- "Answer using ONLY the provided document context. "
125
- "Be concise and cite specific details. "
126
- "If the answer is not in the context, say so clearly."
127
- ),
128
- },
129
- {
130
- "role": "user",
131
- "content": "Document context:\n" + context + "\n\nQuestion: " + question,
132
- },
133
- ]
134
 
135
  last_error = ""
136
  for model_id in CANDIDATE_MODELS:
137
  try:
138
- url = (
139
- "https://router.huggingface.co/hf-inference/models/"
140
- + model_id
141
- + "/v1/chat/completions"
142
- )
143
- resp = requests.post(
144
- url,
145
- headers=headers,
146
- json={
147
- "model": model_id,
148
- "messages": messages,
149
- "max_tokens": 512,
150
- "temperature": 0.2,
151
- },
152
  timeout=60,
153
  )
154
- if resp.status_code == 200:
155
- answer = resp.json()["choices"][0]["message"]["content"].strip()
156
- if answer:
157
- return answer
158
- else:
159
- last_error = str(resp.status_code) + ": " + resp.text[:200]
 
 
 
 
 
160
  except Exception as e:
161
  last_error = str(e)
162
  continue
163
 
 
164
  excerpt = extract_best(question, context)
165
  return (
166
- "LLM models unavailable - showing most relevant excerpt:\n\n"
167
  + excerpt
168
- + "\n\nLast error: " + last_error
169
  )
170
 
171
 
@@ -179,8 +164,9 @@ def extract_best(question: str, context: str) -> str:
179
  if score > best_score:
180
  best_score = score
181
  best_chunk = chunk.strip()
182
- excerpt = best_chunk[:600] + ("..." if len(best_chunk) > 600 else "")
183
- return excerpt or "No relevant content found."
 
184
 
185
 
186
  def get_suffix(name: str) -> str:
 
1
  """
2
  RAG Engine
3
  Embeddings : sentence-transformers/all-MiniLM-L6-v2
4
+ Vector DB : ChromaDB (local)
5
+ LLM : HuggingFace InferenceClient (official library, auto-routing)
6
  Chunking : Recursive character splitter with overlap
7
  """
8
 
9
  import os
10
  import re
 
11
  import tempfile
12
  from typing import Tuple, List
13
 
 
14
  from chromadb.config import Settings
15
  from langchain.text_splitter import RecursiveCharacterTextSplitter
16
  from langchain_community.embeddings import HuggingFaceEmbeddings
 
25
  COLLECTION_NAME = "docmind_collection"
26
  CHROMA_DIR = "./chroma_db"
27
 
28
+ # Models to try in order (all free with HF token)
29
  CANDIDATE_MODELS = [
30
  "mistralai/Mistral-7B-Instruct-v0.3",
31
  "microsoft/Phi-3.5-mini-instruct",
 
32
  "HuggingFaceH4/zephyr-7b-beta",
33
+ "google/gemma-2-2b-it",
34
  ]
35
 
36
 
 
104
  if not hf_token:
105
  excerpt = extract_best(question, context)
106
  return (
107
+ "HF_TOKEN not set. Add it as a Secret in your Space Settings.\n\n"
108
+ "Best matching excerpt:\n\n" + excerpt
 
 
109
  )
110
 
111
+ # Use the official huggingface_hub InferenceClient
112
+ try:
113
+ from huggingface_hub import InferenceClient
114
+ except ImportError:
115
+ return "huggingface_hub not installed. Check requirements.txt."
116
+
117
+ system_prompt = (
118
+ "You are DocMind, an expert document analyst. "
119
+ "Answer using ONLY the provided document context. "
120
+ "Be concise and accurate. "
121
+ "If the answer is not in the context, say so clearly."
122
+ )
123
+ user_message = "Document context:\n" + context + "\n\nQuestion: " + question
 
 
 
 
 
 
124
 
125
  last_error = ""
126
  for model_id in CANDIDATE_MODELS:
127
  try:
128
+ client = InferenceClient(
129
+ model=model_id,
130
+ token=hf_token,
 
 
 
 
 
 
 
 
 
 
 
131
  timeout=60,
132
  )
133
+ result = client.chat_completion(
134
+ messages=[
135
+ {"role": "system", "content": system_prompt},
136
+ {"role": "user", "content": user_message},
137
+ ],
138
+ max_tokens=512,
139
+ temperature=0.2,
140
+ )
141
+ answer = result.choices[0].message.content.strip()
142
+ if answer:
143
+ return answer
144
  except Exception as e:
145
  last_error = str(e)
146
  continue
147
 
148
+ # All models failed — use extractive fallback
149
  excerpt = extract_best(question, context)
150
  return (
151
+ "AI answer unavailable. Here is the most relevant part of your document:\n\n"
152
  + excerpt
153
+ + "\n\n(Error: " + last_error + ")"
154
  )
155
 
156
 
 
164
  if score > best_score:
165
  best_score = score
166
  best_chunk = chunk.strip()
167
+ if not best_chunk:
168
+ return "No relevant content found."
169
+ return best_chunk[:600] + ("..." if len(best_chunk) > 600 else "")
170
 
171
 
172
  def get_suffix(name: str) -> str: