pluto90 committed on
Commit
f06dea6
·
verified ·
1 Parent(s): f383d3f

Upload 6 files

Browse files
app/core/embedding_engine.py CHANGED
@@ -1,160 +1,233 @@
1
- # # embedding_engine.py
2
-
3
- # import uuid
4
- # from qdrant_client import QdrantClient, models
5
- # from qdrant_client.http.models import Distance, VectorParams
6
- # from sentence_transformers import SentenceTransformer
7
- # from app.core.config import QDRANT_URL, QDRANT_API_KEY
8
- # # from config import QDRANT_URL, QDRANT_API_KEY
9
-
10
- # # embedder = SentenceTransformer("all-MiniLM-L6-v2")
11
- # # embedder.save("models/all-MiniLM-L6-v2")
12
-
13
-
14
- # MODEL_PATH = "app/core/models/all-MiniLM-L6-v2"
15
- # embedder = SentenceTransformer(MODEL_PATH)
16
-
17
- # qdrant = QdrantClient(
18
- # url=QDRANT_URL,
19
- # api_key=QDRANT_API_KEY,
20
- # check_compatibility=False
21
- # )
22
-
23
- # COLLECTION_NAME = "smartnotes"
24
- # BATCH_SIZE = 100
25
-
26
-
27
- # def ensure_collection():
28
- # collections = qdrant.get_collections().collections
29
- # if COLLECTION_NAME not in [c.name for c in collections]:
30
- # qdrant.create_collection(
31
- # collection_name=COLLECTION_NAME,
32
- # vectors_config=VectorParams(
33
- # size=384,
34
- # distance=Distance.COSINE
35
- # ),
36
- # )
37
-
38
- # # ✅ Add this part
39
- # qdrant.create_payload_index(
40
- # collection_name=COLLECTION_NAME,
41
- # field_name="doc_id",
42
- # field_schema="keyword"
43
- # )
44
-
45
-
46
- # def embed_and_store(text_chunks, doc_id):
47
- # """Embed chunks and store them in Qdrant efficiently."""
48
- # ensure_collection()
49
- # print(f"πŸ”Ή Embedding {len(text_chunks)} chunks...")
50
-
51
- # # Generate embeddings
52
- # vectors = embedder.encode(text_chunks, show_progress_bar=True).tolist()
53
-
54
- # # Prepare points
55
- # points = [
56
- # models.PointStruct(
57
- # id=str(uuid.uuid4()),
58
- # vector=vectors[i],
59
- # payload={"doc_id": doc_id, "text": text_chunks[i]},
60
- # )
61
- # for i in range(len(vectors))
62
- # ]
63
-
64
- # # ✅ Upsert in small batches to avoid timeouts
65
- # print("πŸ”Ή Uploading to Qdrant in batches...")
66
- # for i in range(0, len(points), BATCH_SIZE):
67
- # batch = points[i:i + BATCH_SIZE]
68
- # qdrant.upsert(collection_name=COLLECTION_NAME, points=batch)
69
- # print(f" β†’ Uploaded batch {i // BATCH_SIZE + 1}/{len(points) // BATCH_SIZE + 1}")
70
-
71
- # print("βœ… All embeddings stored successfully!")
72
-
73
-
74
-
75
-
76
-
77
-
78
-
79
-
80
-
81
-
82
-
83
-
84
-
85
-
86
-
87
-
88
-
89
-
90
-
91
- # embedding_engine.py
92
-
93
- import uuid
94
- from qdrant_client import QdrantClient, models
95
- from qdrant_client.http.models import Distance, VectorParams
96
- from sentence_transformers import SentenceTransformer
97
- from app.core.config import QDRANT_URL, QDRANT_API_KEY
98
-
99
- # embedder = SentenceTransformer("all-MiniLM-L6-v2")
100
- # embedder.save("models/all-MiniLM-L6-v2")
101
-
102
-
103
- MODEL_PATH = "app/core/models/all-MiniLM-L6-v2"
104
- embedder = SentenceTransformer(MODEL_PATH)
105
-
106
- qdrant = QdrantClient(
107
- url=QDRANT_URL,
108
- api_key=QDRANT_API_KEY,
109
- check_compatibility=False
110
- )
111
-
112
- COLLECTION_NAME = "smartnotes"
113
- BATCH_SIZE = 100
114
-
115
-
116
- def ensure_collection():
117
- collections = qdrant.get_collections().collections
118
- if COLLECTION_NAME not in [c.name for c in collections]:
119
- qdrant.create_collection(
120
- collection_name=COLLECTION_NAME,
121
- vectors_config=VectorParams(
122
- size=384,
123
- distance=Distance.COSINE
124
- ),
125
- )
126
-
127
- # ✅ Add this part
128
- qdrant.create_payload_index(
129
- collection_name=COLLECTION_NAME,
130
- field_name="doc_id",
131
- field_schema="keyword"
132
- )
133
-
134
-
135
- def embed_and_store(text_chunks, doc_id):
136
- """Embed chunks and store them in Qdrant efficiently."""
137
- ensure_collection()
138
- print(f"πŸ”Ή Embedding {len(text_chunks)} chunks...")
139
-
140
- # Generate embeddings
141
- vectors = embedder.encode(text_chunks, show_progress_bar=True).tolist()
142
-
143
- # Prepare points
144
- points = [
145
- models.PointStruct(
146
- id=str(uuid.uuid4()),
147
- vector=vectors[i],
148
- payload={"doc_id": doc_id, "text": text_chunks[i]},
149
- )
150
- for i in range(len(vectors))
151
- ]
152
-
153
- # ✅ Upsert in small batches to avoid timeouts
154
- print("πŸ”Ή Uploading to Qdrant in batches...")
155
- for i in range(0, len(points), BATCH_SIZE):
156
- batch = points[i:i + BATCH_SIZE]
157
- qdrant.upsert(collection_name=COLLECTION_NAME, points=batch)
158
- print(f" β†’ Uploaded batch {i // BATCH_SIZE + 1}/{len(points) // BATCH_SIZE + 1}")
159
-
160
- print("βœ… All embeddings stored successfully!")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # # embedding_engine.py
2
+
3
+ # import uuid, time
4
+ # from qdrant_client import QdrantClient, models
5
+ # from qdrant_client.http.models import Distance, VectorParams
6
+ # from qdrant_client.http.exceptions import UnexpectedResponse
7
+ # from sentence_transformers import SentenceTransformer
8
+ # from app.core.config import QDRANT_URL, QDRANT_API_KEY
9
+
10
+ # MODEL_PATH = "app/core/models/bge-base-en-v1.5"
11
+ # embedder = SentenceTransformer(MODEL_PATH)
12
+
13
+ # qdrant = QdrantClient(
14
+ # url=QDRANT_URL,
15
+ # api_key=QDRANT_API_KEY,
16
+ # check_compatibility=False
17
+ # )
18
+
19
+ # COLLECTION_NAME = "smartnotes"
20
+ # BATCH_SIZE = 10
21
+
22
+
23
+ # def ensure_collection():
24
+ # collections = qdrant.get_collections().collections
25
+ # if COLLECTION_NAME not in [c.name for c in collections]:
26
+ # qdrant.create_collection(
27
+ # collection_name=COLLECTION_NAME,
28
+ # vectors_config=VectorParams(
29
+ # size=768,
30
+ # distance=Distance.COSINE
31
+ # ),
32
+ # )
33
+
34
+ # # ✅ Add this part
35
+ # qdrant.create_payload_index(
36
+ # collection_name=COLLECTION_NAME,
37
+ # field_name="doc_id",
38
+ # field_schema="keyword"
39
+ # )
40
+
41
+
42
+ # def embed_and_store(text_chunks, doc_id):
43
+ # print(f"πŸ“Š Embedding and storing {len(text_chunks)} chunks...")
44
+ # ensure_collection()
45
+
46
+ # print(f"πŸ”Ή Embedding {len(text_chunks)} chunks...")
47
+
48
+ # vectors = embed_documents(text_chunks)
49
+
50
+ # points = [
51
+ # models.PointStruct(
52
+ # id=str(uuid.uuid4()),
53
+ # vector=vectors[i],
54
+ # payload={
55
+ # "doc_id": doc_id,
56
+ # "text": text_chunks[i],
57
+ # "chunk_id": i,
58
+ # "length": len(text_chunks[i])
59
+ # },
60
+ # )
61
+ # for i in range(len(vectors))
62
+ # ]
63
+
64
+ # print("πŸ”Ή Uploading to Qdrant in batches...")
65
+
66
+ # for i in range(0, len(points), BATCH_SIZE):
67
+ # batch = points[i:i + BATCH_SIZE]
68
+
69
+ # success = False
70
+ # retries = 3
71
+
72
+ # while not success and retries > 0:
73
+ # try:
74
+ # qdrant.upsert(
75
+ # collection_name=COLLECTION_NAME,
76
+ # points=batch
77
+ # )
78
+ # success = True
79
+ # print(f" β†’ Uploaded batch {i // BATCH_SIZE + 1}")
80
+
81
+ # except Exception as e:
82
+ # print("❌ Qdrant error:", e)
83
+ # retries -= 1
84
+ # time.sleep(1.5) # 🔥 increase wait
85
+
86
+ # if not success:
87
+ # print("⚠️ Skipping batch after retries")
88
+
89
+ # time.sleep(0.4) # 🔥 throttle
90
+
91
+
92
+
93
+ # def embed_documents(texts):
94
+ # vectors= []
95
+
96
+ # for i in range(0, len(texts), 32):
97
+ # batch = texts[i:i+32]
98
+ # batch_vectors = embedder.encode(batch, show_progress_bar=False)
99
+ # vectors.extend(batch_vectors.tolist())
100
+
101
+ # return vectors
102
+
103
+
104
+ # def embed_query(text):
105
+ # return embedder.encode(
106
+ # f"query: {text}",
107
+ # normalize_embeddings=True
108
+ # )
109
+
110
+
111
+
112
+
113
+
114
+
115
+
116
+
117
+
118
+
119
+
120
+
121
+
122
+
123
+
124
+
125
+
126
+
127
+
128
+ # embedding_engine.py
129
+ import uuid, time
130
+ from qdrant_client import QdrantClient, models
131
+ from qdrant_client.http.models import Distance, VectorParams
132
+ from sentence_transformers import SentenceTransformer
133
+ from app.core.config import QDRANT_URL, QDRANT_API_KEY
134
+
135
+ MODEL_PATH = "app/core/models/bge-base-en-v1.5"
136
+ embedder = SentenceTransformer(MODEL_PATH)
137
+
138
+ qdrant = QdrantClient(
139
+ url=QDRANT_URL,
140
+ api_key=QDRANT_API_KEY,
141
+ check_compatibility=False
142
+ )
143
+
144
+ COLLECTION_NAME = "smartnotes"
145
+ BATCH_SIZE = 5 # ✅ reduced for free tier
146
+
147
+
148
+ def ensure_collection():
149
+ collections = qdrant.get_collections().collections
150
+ if COLLECTION_NAME not in [c.name for c in collections]:
151
+ qdrant.create_collection(
152
+ collection_name=COLLECTION_NAME,
153
+ vectors_config=VectorParams(size=768, distance=Distance.COSINE),
154
+ )
155
+ qdrant.create_payload_index(
156
+ collection_name=COLLECTION_NAME,
157
+ field_name="doc_id",
158
+ field_schema="keyword"
159
+ )
160
+
161
+
162
+ def embed_and_store(text_chunks, doc_id):
163
+ print(f"πŸ“Š Final chunks being embedded: {len(text_chunks)}")
164
+ ensure_collection()
165
+
166
+ vectors = embed_documents(text_chunks) # ✅ now uses correct doc prefix
167
+
168
+ points = [
169
+ models.PointStruct(
170
+ id=str(uuid.uuid4()),
171
+ vector=vectors[i],
172
+ payload={
173
+ "doc_id": doc_id,
174
+ "text": text_chunks[i],
175
+ "chunk_id": i,
176
+ "length": len(text_chunks[i])
177
+ },
178
+ )
179
+ for i in range(len(vectors))
180
+ ]
181
+
182
+ failed_batches = []
183
+
184
+ for i in range(0, len(points), BATCH_SIZE):
185
+ batch = points[i:i + BATCH_SIZE]
186
+ batch_num = i // BATCH_SIZE + 1
187
+ success = False
188
+
189
+ for attempt in range(4): # ✅ 4 attempts with exponential backoff
190
+ try:
191
+ qdrant.upsert(collection_name=COLLECTION_NAME, points=batch)
192
+ success = True
193
+ print(f" β†’ Batch {batch_num} uploaded")
194
+ break
195
+ except Exception as e:
196
+ wait = 2 ** attempt # 1s, 2s, 4s, 8s
197
+ print(f" ⚠️ Batch {batch_num} attempt {attempt+1} failed: {e} | retrying in {wait}s")
198
+ time.sleep(wait)
199
+
200
+ if not success:
201
+ failed_batches.append(batch_num)
202
+ print(f" ❌ Batch {batch_num} permanently failed")
203
+
204
+ time.sleep(0.6) # ✅ throttle between successful batches
205
+
206
+ if failed_batches:
207
+ # ✅ raise so the caller (routes.py) knows something went wrong
208
+ raise RuntimeError(f"Failed to upload batches: {failed_batches}")
209
+
210
+ print(f"βœ… All batches uploaded for doc_id={doc_id}")
211
+
212
+
213
+ def embed_documents(texts):
214
+ """Embed document chunks with correct BGE prefix and normalization."""
215
+ prefixed = [f"Represent this sentence: {t}" for t in texts] # ✅ correct BGE doc prefix
216
+ vectors = []
217
+ for i in range(0, len(prefixed), 32):
218
+ batch = prefixed[i:i + 32]
219
+ batch_vectors = embedder.encode(
220
+ batch, normalize_embeddings=True, show_progress_bar=False)
221
+
222
+ vectors.extend(batch_vectors.tolist())
223
+ return vectors
224
+
225
+
226
+ def embed_query(text):
227
+ """Embed a search query — BGE uses 'query:' prefix for retrieval."""
228
+ return embedder.encode(
229
+ f"query: {text}",
230
+ normalize_embeddings=True
231
+ ).tolist() # ✅ always return list, not numpy array
232
+
233
+
app/core/llm_engine.py CHANGED
@@ -1,19 +1,41 @@
1
- # llm_engine.py
2
-
3
- import google.generativeai as genai
4
- from app.core.config import GEMINI_API_KEY
5
-
6
- from langchain_google_genai import ChatGoogleGenerativeAI
7
-
8
- # ✅ Configure Gemini client
9
- genai.configure(api_key=GEMINI_API_KEY)
10
-
11
-
12
- llm = ChatGoogleGenerativeAI(
13
- model="gemini-2.5-flash",
14
- google_api_key=GEMINI_API_KEY,
15
- temperature=0.2,
16
- max_output_tokens=500,
17
- convert_system_message_to_human=True
18
- )
19
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # # llm_engine.py
2
+
3
+ import google.generativeai as genai
4
+ from app.core.config import GEMINI_API_KEY
5
+ from langchain_google_genai import ChatGoogleGenerativeAI
6
+
7
+ # ✅ Configure Gemini client
8
+ genai.configure(api_key=GEMINI_API_KEY)
9
+
10
+
11
+ llm = ChatGoogleGenerativeAI(
12
+ model="gemini-2.5-flash",
13
+ google_api_key=GEMINI_API_KEY,
14
+ temperature=0.2,
15
+ max_output_tokens=800,
16
+ )
17
+
18
+
19
+ # # ✅ Separate LLM for evaluator — needs near-deterministic JSON output
20
+ # eval_llm = ChatGoogleGenerativeAI(
21
+ # model="gemini-2.5-flash",
22
+ # google_api_key=GEMINI_API_KEY,
23
+ # temperature=0.0, # ✅ deterministic — evaluator must return valid JSON
24
+ # max_output_tokens=200, # ✅ evaluator only returns a small JSON blob
25
+ # thinking_level="none" # to disable chain-of-thought
26
+ # )
27
+
28
+
29
+ eval_llm = ChatGoogleGenerativeAI(
30
+ model="gemini-2.0-flash", # no thinking, faster
31
+ google_api_key=GEMINI_API_KEY,
32
+ temperature=0.0,
33
+ max_output_tokens=200,
34
+ # model_kwargs={
35
+ # "generation_config": {
36
+ # "thinking_config": {
37
+ # "thinking_budget": 0 # ✅ 0 = disabled, bypasses langchain validation entirely
38
+ # }
39
+ # }
40
+ # }
41
+ )
app/core/pdf_processor.py CHANGED
@@ -1,120 +1,52 @@
1
- # # pdf_preprocessor.py
2
-
3
- # import os
4
- # from pypdf import PdfReader
5
- # from pdf2image import convert_from_path
6
- # import pytesseract
7
-
8
- # # Optional: Set Tesseract path manually on Windows
9
- # # pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
10
-
11
- # def extract_text_from_pdf(file_path: str) -> str:
12
- # """
13
- # Extract text from both text-based and image-based PDFs.
14
- # Falls back to OCR using pytesseract if no embedded text is found.
15
- # """
16
- # text_output = []
17
- # reader = PdfReader(file_path)
18
- # total_pages = len(reader.pages)
19
-
20
- # print(f"πŸ“„ Processing PDF: {file_path} ({total_pages} pages)")
21
-
22
- # for page_num, page in enumerate(reader.pages, start=1):
23
- # try:
24
- # # Try normal text extraction
25
- # extracted_text = page.extract_text()
26
- # if extracted_text and extracted_text.strip():
27
- # text_output.append(extracted_text)
28
- # print(f"βœ… Page {page_num}: Extracted embedded text.")
29
- # else:
30
- # # Run OCR if no text found
31
- # print(f"πŸ” Page {page_num}: No text found, running OCR...")
32
- # images = convert_from_path(
33
- # file_path, first_page=page_num, last_page=page_num
34
- # )
35
- # ocr_text = ""
36
- # for img in images:
37
- # ocr_text += pytesseract.image_to_string(img, lang="eng", config="--psm 6")
38
- # if ocr_text.strip():
39
- # text_output.append(ocr_text)
40
- # print(f"🧠 Page {page_num}: OCR extraction complete.")
41
- # else:
42
- # print(f"⚠️ Page {page_num}: OCR found no readable text.")
43
- # except Exception as e:
44
- # print(f"❌ Error processing page {page_num}: {e}")
45
-
46
- # full_text = "\n".join(text_output)
47
- # if not full_text.strip():
48
- # print("⚠️ Warning: No text extracted from this PDF at all.")
49
- # else:
50
- # print(f"βœ… Done! Extracted {len(full_text.split())} words total.")
51
-
52
- # return full_text
53
-
54
-
55
-
56
-
57
-
58
-
59
-
60
-
61
-
62
-
63
-
64
-
65
-
66
-
67
-
68
-
69
- # pdf_preprocessor.py
70
-
71
- import os
72
- from pypdf import PdfReader
73
- from pdf2image import convert_from_path
74
- import pytesseract
75
-
76
- # Optional: Set Tesseract path manually on Windows
77
- # pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
78
-
79
- def extract_text_from_pdf(file_path: str) -> str:
80
- """
81
- Extract text from both text-based and image-based PDFs.
82
- Falls back to OCR using pytesseract if no embedded text is found.
83
- """
84
- text_output = []
85
- reader = PdfReader(file_path)
86
- total_pages = len(reader.pages)
87
-
88
- print(f"πŸ“„ Processing PDF: {file_path} ({total_pages} pages)")
89
-
90
- for page_num, page in enumerate(reader.pages, start=1):
91
- try:
92
- # Try normal text extraction
93
- extracted_text = page.extract_text()
94
- if extracted_text and extracted_text.strip():
95
- text_output.append(extracted_text)
96
- print(f"βœ… Page {page_num}: Extracted embedded text.")
97
- else:
98
- # Run OCR if no text found
99
- print(f"πŸ” Page {page_num}: No text found, running OCR...")
100
- images = convert_from_path(
101
- file_path, first_page=page_num, last_page=page_num
102
- )
103
- ocr_text = ""
104
- for img in images:
105
- ocr_text += pytesseract.image_to_string(img, lang="eng", config="--psm 6")
106
- if ocr_text.strip():
107
- text_output.append(ocr_text)
108
- print(f"🧠 Page {page_num}: OCR extraction complete.")
109
- else:
110
- print(f"⚠️ Page {page_num}: OCR found no readable text.")
111
- except Exception as e:
112
- print(f"❌ Error processing page {page_num}: {e}")
113
-
114
- full_text = "\n".join(text_output)
115
- if not full_text.strip():
116
- print("⚠️ Warning: No text extracted from this PDF at all.")
117
- else:
118
- print(f"βœ… Done! Extracted {len(full_text.split())} words total.")
119
-
120
- return full_text
 
1
+ # pdf_preprocessor.py
2
+
3
+ import os
4
+ from pypdf import PdfReader
5
+ from pdf2image import convert_from_path
6
+ import pytesseract
7
+
8
+ # Optional: Set Tesseract path manually on Windows
9
+ # pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
10
+
11
+ def extract_text_from_pdf(file_path: str) -> str:
12
+ """
13
+ Extract text from both text-based and image-based PDFs.
14
+ Falls back to OCR using pytesseract if no embedded text is found.
15
+ """
16
+ text_output = []
17
+ reader = PdfReader(file_path)
18
+ total_pages = len(reader.pages)
19
+
20
+ print(f"πŸ“„ Processing PDF: {file_path} ({total_pages} pages)")
21
+
22
+ for page_num, page in enumerate(reader.pages, start=1):
23
+ try:
24
+ # Try normal text extraction
25
+ extracted_text = page.extract_text()
26
+ if extracted_text and extracted_text.strip():
27
+ text_output.append(extracted_text)
28
+ print(f"βœ… Page {page_num}: Extracted embedded text.")
29
+ else:
30
+ # Run OCR if no text found
31
+ print(f"πŸ” Page {page_num}: No text found, running OCR...")
32
+ images = convert_from_path(
33
+ file_path, first_page=page_num, last_page=page_num
34
+ )
35
+ ocr_text = ""
36
+ for img in images:
37
+ ocr_text += pytesseract.image_to_string(img, lang="eng", config="--psm 6")
38
+ if ocr_text.strip():
39
+ text_output.append(ocr_text)
40
+ print(f"🧠 Page {page_num}: OCR extraction complete.")
41
+ else:
42
+ print(f"⚠️ Page {page_num}: OCR found no readable text.")
43
+ except Exception as e:
44
+ print(f"❌ Error processing page {page_num}: {e}")
45
+
46
+ full_text = "\n\n".join(text_output)
47
+ if not full_text.strip():
48
+ print("⚠️ Warning: No text extracted from this PDF at all.")
49
+ else:
50
+ print(f"βœ… Done! Extracted {len(full_text.split())} words total.")
51
+
52
+ return full_text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app/core/rag_service.py CHANGED
@@ -1,55 +1,34 @@
1
- # app/core/rag_service.py
2
-
3
- from app.core.embedding_engine import embedder, COLLECTION_NAME
4
- from qdrant_client.http.models import Filter, FieldCondition, MatchValue
5
- from qdrant_client import QdrantClient
6
- from app.core.config import QDRANT_URL, QDRANT_API_KEY
7
-
8
- qdrant_client = QdrantClient(
9
- url=QDRANT_URL,
10
- api_key=QDRANT_API_KEY,
11
- check_compatibility=False
12
- )
13
-
14
 
15
- # def get_rag_context(question: str, doc_id: str):
16
- # question_vector = embedder.encode([question])[0].tolist()
17
-
18
- # hits = qdrant_client.query_points(
19
- # collection_name=COLLECTION_NAME,
20
- # query=question_vector,
21
- # query_filter=Filter(
22
- # must=[FieldCondition(key="doc_id", match=MatchValue(value=doc_id))]
23
- # ),
24
- # limit=5,
25
- # ).points
26
-
27
- # # context = "\n".join([hit.payload["text"] for hit in hits])
28
-
29
- # contexts = []
30
- # sources = []
31
-
32
- # for hit in hits:
33
- # text = hit.payload.get("text", "")
34
- # contexts.append(text)
35
-
36
- # sources.append({
37
- # "text": text[:300], # limit for UI
38
- # # add page if you have it later
39
- # })
40
-
41
- # context = "\n".join(contexts)
42
-
43
- # return context, sources
44
 
 
 
 
 
 
45
 
46
  # def get_rag_context(query, doc_id, top_k=3):
 
 
47
  # query_vector = embedder.encode(query).tolist()
48
 
 
49
  # results = qdrant_client.query_points(
50
- # collection_name=doc_id,
51
  # query=query_vector,
52
- # limit=top_k
 
 
 
 
 
 
 
 
53
  # )
54
 
55
  # points = results.points
@@ -65,23 +44,38 @@ qdrant_client = QdrantClient(
65
 
66
 
67
 
68
- def get_rag_context(query, doc_id, top_k=3):
69
-
70
- # ✅ Embed query
71
- query_vector = embedder.encode(query).tolist()
72
 
73
- # ✅ Query SINGLE collection + filter by doc_id
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  results = qdrant_client.query_points(
75
- collection_name="smartnotes", # 🔥 FIXED
76
  query=query_vector,
77
  limit=top_k,
 
78
  query_filter=Filter(
79
- must=[
80
- FieldCondition(
81
- key="doc_id",
82
- match=MatchValue(value=doc_id)
83
- )
84
- ]
85
  )
86
  )
87
 
@@ -90,9 +84,12 @@ def get_rag_context(query, doc_id, top_k=3):
90
  if not points:
91
  return "", [], []
92
 
93
- context = "\n".join([p.payload["text"] for p in points])
94
- sources = [p.payload.get("source") for p in points]
95
  scores = [p.score for p in points]
96
 
97
  return context, sources, scores
98
 
 
 
 
 
1
+ # # app/core/rag_service.py
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
+ # from app.core.embedding_engine import embedder, COLLECTION_NAME
4
+ # from qdrant_client.http.models import Filter, FieldCondition, MatchValue
5
+ # from qdrant_client import QdrantClient
6
+ # from app.core.config import QDRANT_URL, QDRANT_API_KEY
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
+ # qdrant_client = QdrantClient(
9
+ # url=QDRANT_URL,
10
+ # api_key=QDRANT_API_KEY,
11
+ # check_compatibility=False
12
+ # )
13
 
14
  # def get_rag_context(query, doc_id, top_k=3):
15
+
16
+ # # ✅ Embed query
17
  # query_vector = embedder.encode(query).tolist()
18
 
19
+ # # ✅ Query SINGLE collection + filter by doc_id
20
  # results = qdrant_client.query_points(
21
+ # collection_name="smartnotes", # 🔥 FIXED
22
  # query=query_vector,
23
+ # limit=top_k,
24
+ # query_filter=Filter(
25
+ # must=[
26
+ # FieldCondition(
27
+ # key="doc_id",
28
+ # match=MatchValue(value=doc_id)
29
+ # )
30
+ # ]
31
+ # )
32
  # )
33
 
34
  # points = results.points
 
44
 
45
 
46
 
 
 
 
 
47
 
48
+
49
+
50
+
51
+
52
+
53
+
54
+
55
+
56
+ # app/core/rag_service.py
57
+ from app.core.embedding_engine import embed_query, COLLECTION_NAME # ✅ use the correct function
58
+ from qdrant_client.http.models import Filter, FieldCondition, MatchValue
59
+ from qdrant_client import QdrantClient
60
+ from app.core.config import QDRANT_URL, QDRANT_API_KEY
61
+
62
+ qdrant_client = QdrantClient(
63
+ url=QDRANT_URL,
64
+ api_key=QDRANT_API_KEY,
65
+ check_compatibility=False
66
+ )
67
+
68
+
69
+ def get_rag_context(query, doc_id, top_k=5): # ✅ top_k=5 for better recall
70
+ query_vector = embed_query(query) # ✅ uses "query: " prefix + returns list
71
+
72
  results = qdrant_client.query_points(
73
+ collection_name=COLLECTION_NAME,
74
  query=query_vector,
75
  limit=top_k,
76
+ score_threshold=0.35, # ✅ filter truly irrelevant results early
77
  query_filter=Filter(
78
+ must=[FieldCondition(key="doc_id", match=MatchValue(value=doc_id))]
 
 
 
 
 
79
  )
80
  )
81
 
 
84
  if not points:
85
  return "", [], []
86
 
87
+ context = "\n\n---\n\n".join([p.payload["text"] for p in points]) # ✅ clearer separator
88
+ sources = [p.payload.get("chunk_id", i) for i, p in enumerate(points)]
89
  scores = [p.score for p in points]
90
 
91
  return context, sources, scores
92
 
93
+
94
+
95
+
app/core/text_splitter.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # text_splitter.py
2
+
3
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
4
+
5
+ splitter = RecursiveCharacterTextSplitter(
6
+ chunk_size=500,
7
+ chunk_overlap=150,
8
+ separators=["\n\n", "\n", ".", " ", ""]
9
+ )
10
+
11
+ def split_text(text):
12
+
13
+ chunks = splitter.split_text(text)
14
+
15
+ # 🔥 CLEANING STEP (VERY IMPORTANT)
16
+ cleaned_chunks = []
17
+ for chunk in chunks:
18
+ chunk = chunk.strip()
19
+ if len(chunk) > 50: # ❌ remove tiny garbage chunks
20
+ cleaned_chunks.append(chunk)
21
+
22
+ return cleaned_chunks