kkthyagharajan commited on
Commit
46f8a04
·
verified ·
1 Parent(s): a776e70

Upload 22 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ doc_ingestion/AIML_Unit1_RMD_ECE.pdf filter=lfs diff=lfs merge=lfs -text
37
+ doc_ingestion/AIML_Unit2_RMD_ECE.pdf filter=lfs diff=lfs merge=lfs -text
38
+ faiss.index filter=lfs diff=lfs merge=lfs -text
39
+ kkt_SQLite_DB.db filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Container image for the FastAPI RAG server; the server listens on port 7860.
FROM python:3.11-slim

WORKDIR /app

# System dependencies (OCR support). --no-install-recommends keeps the image
# small, and the apt lists are removed in the same layer so they are never stored.
RUN apt-get update && apt-get install -y --no-install-recommends \
        tesseract-ocr \
        libgl1 \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies BEFORE copying the sources, so this layer is
# reused from the build cache whenever only application files change.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy the application code and data files.
COPY . .

# Expose Hugging Face port
EXPOSE 7860

# Run FastAPI.
# NOTE(review): the same commit adds app.py; confirm that a module named
# kkt_FastAPI_serverV22 is actually present in the image, otherwise uvicorn
# fails at start-up with an import error.
CMD ["uvicorn", "kkt_FastAPI_serverV22:app", "--host", "0.0.0.0", "--port", "7860"]
FastAPI_Client/admin.html ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
<!DOCTYPE html>
<html lang="en">
<head>
  <!-- Declared explicitly: the page heading contains a non-ASCII em dash. -->
  <meta charset="UTF-8">
  <title>KKT RAG Admin</title>
  <style>
    body{font-family:Arial;padding:30px}
    table{border-collapse:collapse;width:70%}
    th,td{border:1px solid #ccc;padding:8px}
    button{padding:5px 10px;margin:4px}
  </style>
</head>

<body>

  <h2>KKT Secure Modular RAG Engine — Admin</h2>

  <!-- LOGIN FOR ADMIN: handled by loginAdmin() in admin.js -->
  <div id="authSection">
    <h3>Admin Login</h3>
    <input type="text" id="adminUser" placeholder="Username">
    <input type="password" id="adminPass" placeholder="Password">
    <button onclick="loginAdmin()">Login</button>
  </div>

  <!-- ADMIN PANEL: hidden until loginAdmin() succeeds -->
  <div id="adminPanel" style="display:none;">

    <h3>Upload and Index Document</h3>
    <input type="file" id="fileInput">
    <!-- "&" must be written as an entity inside HTML text -->
    <button onclick="uploadDocument()">Upload &amp; Index</button>

    <br><br>

    <button onclick="loadDocs()">Refresh Documents</button>
    <button onclick="deleteFolder()">Delete Knowledge</button>
    <button onclick="resetIndex()">Reset Index</button>

    <br><br>

    <!-- Rows are rebuilt by loadDocs() -->
    <table id="docTable">
      <tr>
        <th>Document</th>
        <th>Chunks</th>
        <th>Action</th>
      </tr>
    </table>

  </div>

  <script src="/static/admin.js?v=1"></script>
</body>
</html>
FastAPI_Client/admin.js ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ let token = "";
2
+
3
+ // ------------------ ADMIN LOGIN ------------------
4
/**
 * Log the administrator in.
 *
 * Reads the credentials from #adminUser / #adminPass, posts them to /login as
 * a URL-encoded form and, on success, stores the returned access token (in
 * the `token` global and in localStorage), swaps the login form for the admin
 * panel and loads the document list. Failures are reported with alert().
 */
async function loginAdmin() {
    const userName = document.getElementById("adminUser").value;
    const passWord = document.getElementById("adminPass").value;

    // Both fields are mandatory.
    if (!(userName && passWord)) {
        alert("Please enter username and password");
        return;
    }

    const credentials = new URLSearchParams();
    credentials.append("username", userName);
    credentials.append("password", passWord);

    try {
        const reply = await fetch("/login", {
            method: "POST",
            headers: {"Content-Type":"application/x-www-form-urlencoded"},
            body: credentials
        });
        const payload = await reply.json();

        if (!reply.ok) {
            alert(payload.detail || "Login Failed");
            return;
        }

        token = payload.access_token;
        localStorage.setItem("token", token);

        alert("Login Successful");
        document.getElementById("authSection").style.display = "none";
        document.getElementById("adminPanel").style.display = "block";
        loadDocs();
    } catch (err) {
        // Network failure or a response body that is not JSON.
        console.error(err);
        alert("Login request failed");
    }
}
42
+
43
+ // ------------------ UPLOAD DOCUMENT ------------------
44
/**
 * Upload the file chosen in #fileInput to /admin/upload-document and refresh
 * the document table.
 *
 * The server's `message` (or `error`, or a string `detail`) is shown to the
 * user. A failed request whose body carries none of these still produces a
 * generic failure alert instead of finishing silently.
 */
async function uploadDocument() {
    const fileInput = document.getElementById("fileInput");
    const file = fileInput.files[0];

    if (!file) {
        alert("Please select a file");
        return;
    }

    alert("File is being uploaded and indexed. Please wait ...");

    const formData = new FormData();
    formData.append("file", file);

    try {
        const res = await fetch("/admin/upload-document", {
            method: "POST",
            headers: {"Authorization": "Bearer " + token},
            body: formData
        });

        // An error response may not carry a JSON body; treat that as "no details".
        const data = await res.json().catch(() => ({}));

        const feedback = data.message || data.error ||
            (typeof data.detail === "string" ? data.detail : "");

        if (feedback) {
            alert(feedback);
        } else if (!res.ok) {
            alert("Upload failed (HTTP " + res.status + ")");
        }

        loadDocs();
    } catch (err) {
        console.error(err);
        alert("Upload failed");
    }
}
79
+
80
+ // ------------------ OTHER EXISTING FUNCTIONS ------------------
81
/**
 * Escape a value for interpolation into HTML text or a quoted attribute.
 * @param {*} value Anything; it is converted with String() first.
 * @returns {string} The text with &, <, >, " and ' replaced by entities.
 */
function escapeHtml(value) {
    return String(value)
        .replace(/&/g, "&amp;")
        .replace(/</g, "&lt;")
        .replace(/>/g, "&gt;")
        .replace(/"/g, "&quot;")
        .replace(/'/g, "&#39;");
}

/**
 * Fetch /admin/list-documents and rebuild the rows of #docTable.
 *
 * Each row links to /uploads/<name>, shows the chunk count and offers a
 * Delete button that calls deleteDocument(name, event). Document names come
 * from uploaded files, so everything interpolated into the markup is escaped.
 * Errors are logged to the console and leave the table unchanged.
 */
async function loadDocs() {
    try {
        const response = await fetch("/admin/list-documents", {
            headers: {"Authorization": "Bearer " + token}
        });

        if (!response.ok) throw new Error("Failed to fetch documents");

        const data = await response.json();
        const table = document.getElementById("docTable");
        if (!table) return;

        // A response without a `documents` array is shown as an empty list.
        const documents = Array.isArray(data.documents) ? data.documents : [];

        let rows = `<tr><th>Document</th><th>Chunks</th><th>Action</th></tr>`;

        if (documents.length === 0) {
            rows += `<tr><td colspan="3">No documents found</td></tr>`;
        } else {
            documents.forEach(doc => {
                const href = "/uploads/" + encodeURIComponent(doc.document);
                // JSON.stringify yields a valid JS string literal; escapeHtml then
                // makes that literal safe inside the double-quoted attribute.
                const handler = "deleteDocument(" + JSON.stringify(doc.document) + ", event)";

                rows += `<tr>
<td><a href="${escapeHtml(href)}" target="_blank">${escapeHtml(doc.document)}</a></td>
<td>${escapeHtml(doc.chunks)}</td>
<td><button onclick="${escapeHtml(handler)}">Delete</button></td>
</tr>`;
            });
        }

        table.innerHTML = rows;
    } catch (err) {
        console.error(err);
    }
}
112
+
113
/**
 * Delete one indexed document after confirmation.
 *
 * @param {string} name  File name sent as the `filename` query parameter.
 * @param {Event}  event Click event; its target is the row's Delete button,
 *                       which is disabled while the request is in flight.
 *
 * On success the server message is shown and the table is reloaded. On any
 * failure (network error or non-2xx status) an error is shown and the button
 * is restored so the user can retry.
 */
async function deleteDocument(name, event) {
    if (!confirm("Delete " + name + " ?")) return;

    const btn = event && event.target;
    if (btn) {
        btn.disabled = true;
        btn.innerText = "Deleting...";
    }

    try {
        const params = new URLSearchParams({ filename: name });
        const res = await fetch(`/admin/delete-document?${params.toString()}`, {
            method: "DELETE",
            headers: {"Authorization": "Bearer " + token}
        });

        // Error responses may have no JSON body.
        const data = await res.json().catch(() => ({}));

        if (!res.ok) {
            throw new Error(typeof data.detail === "string" ? data.detail : "HTTP " + res.status);
        }

        alert(data.message || "Deleted successfully");
        await loadDocs();
    } catch (err) {
        console.error(err);
        alert("Error deleting file");
        // The row was not rebuilt, so put the button back into a usable state.
        if (btn) {
            btn.disabled = false;
            btn.innerText = "Delete";
        }
    }
}
135
+
136
/**
 * Ask for a folder name and request its deletion via /admin/delete-folder,
 * then refresh the document table.
 *
 * A non-2xx response or a network failure is reported with alert() instead
 * of being ignored.
 */
async function deleteFolder() {
    const folder = prompt("Enter folder name to delete");
    if (!folder) return;

    try {
        const res = await fetch(`/admin/delete-folder?folder=${encodeURIComponent(folder)}`, {
            method: "DELETE",
            headers: {"Authorization": "Bearer " + token}
        });

        if (!res.ok) {
            alert("Failed to delete folder (HTTP " + res.status + ")");
        }

        loadDocs();
    } catch (err) {
        console.error(err);
        alert("Delete folder request failed");
    }
}
147
+
148
/**
 * After confirmation, ask the server to reset the whole index
 * (/admin/reset-index?confirm=true) and refresh the document table.
 *
 * A non-2xx response or a network failure is reported with alert() instead
 * of being ignored.
 */
async function resetIndex() {
    if (!confirm("Reset entire index?")) return;

    try {
        const res = await fetch("/admin/reset-index?confirm=true", {
            method: "DELETE",
            headers: {"Authorization": "Bearer " + token}
        });

        if (!res.ok) {
            alert("Failed to reset index (HTTP " + res.status + ")");
        }

        loadDocs();
    } catch (err) {
        console.error(err);
        alert("Reset request failed");
    }
}
FastAPI_Client/index.html ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <title>KKT Secure Modular RAG Engine</title>
  <link rel="stylesheet" href="/static/style.css">
</head>

<body>

  <h1>KKT Secure Modular RAG Engine</h1>

  <!-- LOGIN / REGISTER: handled by register() and login() in script.js -->
  <div id="authSection">

    <h2>Register</h2>
    <!-- aria-label: a placeholder alone is not an accessible name -->
    <input type="text" id="regUser" placeholder="Username" aria-label="New username">
    <input type="password" id="regPass" placeholder="Password" aria-label="New password">
    <button onclick="register()">Register</button>

    <h2>Login</h2>
    <input type="text" id="loginUser" placeholder="Username" aria-label="Username">
    <input type="password" id="loginPass" placeholder="Password" aria-label="Password">
    <button onclick="login()">Login</button>

  </div>

  <!-- MAIN APP: hidden until login() succeeds -->
  <div id="mainApp" style="display:none;">

    <h2>Select Model</h2>

    <div class="model-row">
      <select id="modelSelect" aria-label="Model"></select>
      <button onclick="loadModels()">Refresh Models</button>
    </div>

    <!-- Model change confirmation message, written by loadModels() -->
    <div id="modelInfo" style="margin-top:8px;color:green;font-weight:bold;"></div>

    <div class="ref-section">
      <label>
        <input type="checkbox" id="refInline">
        <span>Inline Citation</span>
      </label>

      <label>
        <input type="checkbox" id="refList">
        <span>Bibliography</span>
      </label>
    </div>

    <h2>Chat</h2>

    <div id="chatWindow"></div>

    <div id="chatControls">
      <input type="text" id="questionInput" placeholder="Ask a question" aria-label="Question">
      <button onclick="sendQuestion()">Send</button>
      <button onclick="startVoice()">🎤 Voice</button>
      <button onclick="readChat()">🔈Read</button>
      <button onclick="stopVoice()">Stop</button>
      <button onclick="saveChatAsPDF()">📄 Save as PDF</button>
    </div>

  </div>

  <script src="/static/script.js"></script>

</body>
</html>
FastAPI_Client/script.js ADDED
@@ -0,0 +1,383 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ let token = "";
2
+ let recognition;
3
+ let previousModel = "";
4
+
5
+
6
+ /* ------------------ REGISTER ------------------ */
7
+
8
/**
 * Register a new user with the credentials typed into #regUser / #regPass.
 *
 * Sends a JSON body to /register and reports the outcome with alert().
 * Empty fields are rejected before any request is made, and network or
 * parsing failures are reported instead of surfacing as unhandled rejections.
 */
async function register()
{
    const username = document.getElementById("regUser").value;
    const password = document.getElementById("regPass").value;

    if (!username || !password)
    {
        alert("Please enter username and password");
        return;
    }

    try
    {
        const response = await fetch("/register",
        {
            method: "POST",
            headers: { "Content-Type":"application/json" },
            body: JSON.stringify({ username: username, password: password })
        });

        // Error responses may have no JSON body.
        const data = await response.json().catch(() => ({}));

        if (response.ok)
        {
            alert("Successfully Registered");
        }
        else
        {
            // `detail` is only shown when it is plain text; a structured
            // detail object would otherwise render as "[object Object]".
            alert((typeof data.detail === "string" && data.detail) || "Registration failed");
        }
    }
    catch (err)
    {
        console.error(err);
        alert("Registration request failed");
    }
}
38
+
39
+
40
+ /* ------------------ LOGIN ------------------ */
41
+
42
/**
 * Log a user in with the credentials typed into #loginUser / #loginPass.
 *
 * Posts a URL-encoded form to /login. On success the access token is kept in
 * the `token` global and in localStorage, the auth section is replaced by the
 * main app and the model list is loaded. Empty fields, rejected logins and
 * network failures are reported with alert().
 */
async function login()
{
    const username = document.getElementById("loginUser").value;
    const password = document.getElementById("loginPass").value;

    if (!username || !password)
    {
        alert("Please enter username and password");
        return;
    }

    const formData = new URLSearchParams();
    formData.append("username",username);
    formData.append("password",password);

    try
    {
        const response = await fetch("/login",
        {
            method:"POST",
            headers: { "Content-Type":"application/x-www-form-urlencoded" },
            body:formData
        });

        // Error responses may have no JSON body.
        const data = await response.json().catch(() => ({}));

        if (!response.ok)
        {
            alert((typeof data.detail === "string" && data.detail) || "Login Failed");
            return;
        }

        token = data.access_token;
        localStorage.setItem("token", token);

        alert("Login Successful");

        document.getElementById("authSection").style.display="none";
        document.getElementById("mainApp").style.display="block";

        loadModels();
    }
    catch (err)
    {
        console.error(err);
        alert("Login request failed");
    }
}
84
+
85
+
86
+ /* ------------------ LOAD MODELS ------------------ */
87
/**
 * Fill #modelSelect with the model ids returned by /v1/models.
 *
 * The first model becomes the selection (also recorded in `previousModel`),
 * and a change handler is installed that shows and speaks a confirmation
 * whenever the user picks another model. If the list cannot be loaded the
 * existing options are left untouched and an error is shown in #modelInfo.
 */
async function loadModels()
{
    const select = document.getElementById("modelSelect");
    const modelInfo = document.getElementById("modelInfo");

    // Refresh warning message
    if (modelInfo) {
        modelInfo.innerText =
            "🤖 If you click Refresh Models button, Default LLM model will be selected.";
        modelInfo.style.color = "blue";
    }

    let models;
    try {
        const response = await fetch("/v1/models");
        if (!response.ok) throw new Error("HTTP " + response.status);

        const data = await response.json();
        models = Array.isArray(data.data) ? data.data : [];
    } catch (err) {
        console.error(err);
        if (modelInfo) {
            modelInfo.innerText = "Could not load the model list. Please try again.";
            modelInfo.style.color = "red";
        }
        return;
    }

    select.innerHTML="";

    models.forEach(model =>
    {
        const option = document.createElement("option");
        option.value = model.id;
        option.text = model.id;
        select.appendChild(option);
    });

    // Default selection, only once the options exist.
    if (select.options.length > 0) {
        select.selectedIndex = 0;
        previousModel = select.value;
    }

    // Model change handler: show the confirmation and read it aloud.
    select.onchange = function () {
        const info = document.getElementById("modelInfo");
        if (!info) return;

        info.innerText =
            "🤖 You switched to: " + select.value +
            ". This model will be used for ALL new questions.";
        info.style.color = "green";

        speak(
            "You switched to model " + select.value +
            ". This model will be used for all new questions."
        );
    };
}
145
+
146
+
147
+ /* ------------------ FILE UPLOAD ------------------ */
148
+
149
/**
 * Upload the file chosen in #fileInput to /upload and show the server message.
 *
 * NOTE(review): index.html in this commit contains no #fileInput element, so
 * this function may be unused; it now fails with a message instead of a
 * TypeError when the element or the file is missing.
 */
async function uploadFile()
{
    const input = document.getElementById("fileInput");
    const file = input && input.files ? input.files[0] : undefined;

    if (!file)
    {
        alert("Please select a file");
        return;
    }

    const formData = new FormData();
    formData.append("file",file);

    try
    {
        const response = await fetch("/upload",
        {
            method:"POST",
            headers: { Authorization:"Bearer "+token },
            body:formData
        });

        // Error responses may have no JSON body.
        const data = await response.json().catch(() => ({}));

        // Show the server's message; fall back to a generic text so the user
        // never sees "undefined".
        alert(data.message || (response.ok ? "Upload complete" : "Upload failed"));
    }
    catch (err)
    {
        console.error(err);
        alert("Upload failed");
    }
}
172
+
173
+
174
+
175
+ /* ------------------ CHAT ------------------ */
176
+
177
/**
 * Send the question typed into #questionInput to /v1/chat/completions and
 * render the answer.
 *
 * The request carries the selected model and a `reference_style` derived
 * from the two reference checkboxes ("both", "inline", "list" or "none").
 * The displayed answer has numeric citation markers and the trailing
 * "Note: Citation marker..." text removed. The spoken answer additionally
 * has tags, inline PDF references and the "References:" section removed.
 * A failed or malformed response produces an error message in the chat
 * instead of an unhandled exception.
 */
async function sendQuestion()
{
    const inputBox = document.getElementById("questionInput");
    const question = inputBox.value;

    if(!question.trim()) return;

    addUserMessage(question);
    inputBox.value = "";     // clears the input box

    const model = document.getElementById("modelSelect").value;
    const inline = document.getElementById("refInline").checked;
    const list = document.getElementById("refList").checked;

    let refStyle = "both";
    if (inline && !list) refStyle = "inline";
    else if (!inline && list) refStyle = "list";
    else if (!inline && !list) refStyle = "none";

    let answer;
    try
    {
        const response = await fetch("/v1/chat/completions",
        {
            method:"POST",
            headers:
            {
                "Content-Type":"application/json",
                "Authorization":"Bearer " + token
            },
            body:JSON.stringify(
            {
                model:model,
                reference_style: refStyle,
                messages:[
                    {role:"user",content:question}
                ]
            })
        });

        const data = await response.json();

        if (!response.ok)
        {
            throw new Error(typeof data.detail === "string" ? data.detail : "HTTP " + response.status);
        }

        const first = Array.isArray(data.choices) ? data.choices[0] : undefined;
        answer = first && first.message ? first.message.content : undefined;

        if (typeof answer !== "string") throw new Error("Malformed chat response");
    }
    catch (err)
    {
        console.error(err);
        // Fixed text only: error details are not echoed into the chat markup.
        addBotMessage("Sorry, the request failed. Please try again.");
        return;
    }

    // Clean unwanted model artifacts for display.
    const displayAnswer = answer
        .replace(/\[\d+\]/g, "")                    // remove [1], [2]
        .replace(/Note: Citation marker.*$/s, "");

    addBotMessage(displayAnswer);

    // Text for speech: no markup, no inline references, single spaces.
    let cleanAnswer = answer
        .replace(/<[^>]*>/g, "")
        .replace(/\([^)]*\.pdf[^)]*\)/gi, "")
        .replace(/\[\d+\]/g, "")
        .replace(/Note: Citation marker.*$/s, "")
        .replace(/\s+/g, " ")
        .trim();

    // The bibliography is not read aloud.
    cleanAnswer = cleanAnswer.split("References:")[0];

    speak(cleanAnswer);
}
240
+
241
// Once the page is parsed, let the Enter key in the question box send the
// question (same action as the Send button).
document.addEventListener("DOMContentLoaded", function() {

    const questionInput = document.getElementById("questionInput");
    if (!questionInput) return;   // page without a chat box

    questionInput.addEventListener("keydown", function(event) {
        // isComposing: Enter is confirming an IME candidate, not submitting.
        if (event.key !== "Enter" || event.isComposing) return;

        event.preventDefault();
        sendQuestion();
    });

});
251
+
252
+
253
+ /* ------------------ CHAT DISPLAY ------------------ */
254
+
255
/**
 * Append the user's message to #chatWindow as plain text.
 * @param {string} text Message to show.
 */
function addUserMessage(text)
{
    const bubble = document.createElement("div");
    bubble.className = "userMessage";
    bubble.innerText = text;      // plain text: markup in the question is not interpreted

    document.getElementById("chatWindow").appendChild(bubble);
}
269
+
270
+
271
/**
 * Append a bot answer to #chatWindow.
 * @param {string} text Answer markup; it is inserted as HTML.
 *
 * NOTE(review): the answer is assigned to innerHTML, so any markup in the
 * model output is interpreted by the browser. Presumably this is needed for
 * reference links rendered by the server; confirm, and sanitise the text
 * if the answer can contain untrusted markup.
 */
function addBotMessage(text)
{
    const bubble = document.createElement("div");
    bubble.className = "botMessage";
    bubble.innerHTML = text;

    document.getElementById("chatWindow").appendChild(bubble);
}
285
+
286
+
287
+ /* ------------------ VOICE INPUT ------------------ */
288
+
289
/**
 * Start speech recognition and put the recognised text into #questionInput.
 *
 * Uses the standard SpeechRecognition constructor when available and falls
 * back to the webkit-prefixed one. Browsers that offer neither get an
 * explanatory alert instead of a ReferenceError. The recogniser is kept in
 * the `recognition` global so stopVoice() can stop it.
 */
function startVoice()
{
    const RecognitionCtor =
        typeof SpeechRecognition !== "undefined" ? SpeechRecognition
        : typeof webkitSpeechRecognition !== "undefined" ? webkitSpeechRecognition
        : null;

    if (!RecognitionCtor)
    {
        alert("Voice input is not supported in this browser");
        return;
    }

    recognition = new RecognitionCtor();
    recognition.lang="en-US";

    recognition.onresult=function(event)
    {
        // First alternative of the first result.
        document.getElementById("questionInput").value = event.results[0][0].transcript;
    };

    recognition.start();
}
304
+
305
/**
 * Stop voice input (if a recogniser was started) and cancel any speech
 * output that is queued or currently playing.
 */
function stopVoice()
{
    if (recognition) recognition.stop();

    speechSynthesis.cancel();
}
315
+
316
+
317
+ /* ------------------ VOICE OUTPUT ------------------ */
318
+
319
/**
 * Read `text` aloud in en-US.
 * @param {string} text Text to speak.
 */
function speak(text)
{
    // Cancel whatever is still being spoken so utterances never overlap.
    speechSynthesis.cancel();

    const utterance = new SpeechSynthesisUtterance(text);
    utterance.lang = "en-US";

    speechSynthesis.speak(utterance);
}
332
+
333
/**
 * Read the most recent bot message aloud, leaving out inline PDF
 * references, numeric citation markers and the "References:" section.
 * Does nothing when there is no bot message yet.
 */
function readChat()
{
    const botMessages = document.getElementsByClassName("botMessage");
    const count = botMessages.length;

    if (count === 0) return;

    const spokenText = botMessages[count - 1].innerText
        .replace(/\([^)]*\.pdf[^)]*\)/gi, "")   // inline references such as "(file.pdf, p.3)"
        .replace(/\[\d+\]/g, "")                // markers such as [1], [2]
        .split("References:")[0];               // drop the bibliography

    speak(spokenText);
}
355
+
356
/**
 * Open the chat history in a new window and start the browser's print
 * dialog, from which the user can save it as a PDF.
 *
 * If the chat window is missing or the browser blocks the popup
 * (window.open returns null), the function reports the problem and returns
 * instead of throwing.
 */
function saveChatAsPDF()
{
    const chatWindow = document.getElementById("chatWindow");
    if (!chatWindow) return;

    const originalContent = chatWindow.innerHTML;

    const printWindow = window.open('', '', 'width=800,height=600');
    if (!printWindow)
    {
        alert("Could not open the export window. Please allow pop-ups for this site.");
        return;
    }

    printWindow.document.write(`
<html>
<head>
<title>Chat Export</title>
<style>
body { font-family: Arial; padding: 20px; }
.userMessage { color: blue; margin: 5px 0; }
.botMessage { color: black; margin: 5px 0; }
</style>
</head>
<body>
<h2>Chat History</h2>
${originalContent}
</body>
</html>
`);

    printWindow.document.close();
    printWindow.print();
}
FastAPI_Client/style.css ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/* Styles for the chat page (index.html). */

body
{
    font-family: Arial, sans-serif;
    background: #f4f4f4;
    margin: 40px;
}

h1
{
    text-align: center;
}

/* Login / register card */
#authSection
{
    background: white;
    padding: 20px;
    width: 400px;
    margin: auto;
    border-radius: 8px;
}

#mainApp
{
    margin-top: 30px;
}

input
{
    width: 100%;
    padding: 10px;
    margin-top: 10px;
    font-size: 18px;
}

button
{
    padding: 10px;
    margin-top: 10px;
    cursor: pointer;
}

/* Scrollable chat history */
#chatWindow
{
    height: 400px;
    background: white;
    overflow-y: auto;
    padding: 10px;
    border-radius: 8px;
}

.userMessage
{
    text-align: left;
    color: blue;
    margin: 5px;
    font-family: Calibri, sans-serif;
    font-size: 28px;
}

.botMessage
{
    text-align: left;
    color: black;
    margin: 5px;
    font-family: Arial, sans-serif;
    font-size: 20px;
}

/* Question box and action buttons on one row (was declared twice) */
#chatControls
{
    margin-top: 10px;
    display: flex;
    gap: 10px;
}

/* ===== Reference checkboxes: single row, vertically centred ===== */
.ref-section
{
    display: flex;
    align-items: center;
    gap: 20px;
    margin-top: 25px;
    flex-wrap: nowrap;
}

.ref-section label
{
    display: flex;
    align-items: center;
    gap: 8px;
    white-space: nowrap;   /* keep the label text on one line */
}

/* Slightly bigger checkbox. The earlier duplicate rule set
   transform: translateY(-1px), but it was always overridden by this
   scale(), so only the effective value is kept. */
.ref-section input[type="checkbox"]
{
    margin: 0;
    transform: scale(1.5);
}

/* ===== Model selector row ===== */
.model-row
{
    display: flex;
    gap: 10px;
    align-items: center;
    margin-top: -15px;
}

.model-row select
{
    width: 250px;
    height: 36px;
    font-size: 16px;
}

.model-row button
{
    height: 36px;
    padding: 0 15px;
    white-space: nowrap;
    display: flex;          /* lets align-items centre the label vertically */
    align-items: center;
}

/* Fine-tune dropdown vertical position */
#modelSelect
{
    transform: translateY(2px);
}
READMEmodify.md ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 🚀 FastAPI RAG Server (V22)
2
+
3
+ A production-ready Retrieval-Augmented Generation (RAG) API built using FastAPI. This project enables document ingestion, semantic search using embeddings, and LLM-based question answering.
4
+
5
+ ---
6
+
7
+ ## 📌 Features
8
+
9
+ - 📄 PDF document ingestion and chunking
10
+ - 🧹 Text cleaning pipeline
11
+ - 🔎 Semantic search using FAISS
12
+ - 🤖 LLM integration (Groq / OpenAI compatible API)
13
+ - 🔐 Authentication with hashed passwords (Passlib + JWT)
14
+ - ⚡ FastAPI async endpoints
15
+
16
+ ---
17
+
18
+ ## 🏗️ Project Structure
19
+
20
+ ```
21
+ .
22
+ ├── kkt_FastAPI_serverV22.py # Main FastAPI server
23
+ ├── chunker.py # Document loading & chunking
24
+ ├── text_cleanerV2.py # Text preprocessing pipeline
25
+ ├── indexer.py # Embedding + FAISS index
26
+ ├── requirements.txt # Dependencies
27
+ └── README.md
28
+ ```
29
+
30
+ ---
31
+
32
+ ## ⚙️ Installation
33
+
34
+ ### 1. Clone the repository
35
+ ```bash
36
+ git clone <your-repo-url>
37
+ cd <repo-folder>
38
+ ```
39
+
40
+ ### 2. Create environment (recommended)
41
+ ```bash
42
+ conda create -n rag_env python=3.11
43
+ conda activate rag_env
44
+ ```
45
+
46
+ ### 3. Install dependencies
47
+ ```bash
48
+ pip install -r requirements.txt
49
+ ```
50
+
51
+ ### Required dependencies
52
+ Ensure the following are present:
53
+ ```
54
+ fastapi
55
+ uvicorn
56
+ passlib[bcrypt]
57
+ python-jose[cryptography]
58
+ sentence-transformers
59
+ faiss-cpu
60
+ httpx
61
+ ```
62
+
63
+ ---
64
+
65
+ ## 🔑 Environment Variables
66
+
67
+ Set your API keys before running:
68
+
69
+ ```bash
70
+ export GROQ_API_KEY="your_api_key_here"
71
+ ```
72
+
73
+ On Windows PowerShell:
74
+ ```powershell
75
+ $env:GROQ_API_KEY="your_api_key_here"
76
+ ```
77
+
78
+ ---
79
+
80
+ ## ▶️ Running the Server
81
+
82
+ ```bash
83
+ uvicorn kkt_FastAPI_serverV22:app --host 0.0.0.0 --port 8000
84
+ ```
85
+
86
+ Access API docs:
87
+
88
+ 👉 http://localhost:8000/docs
89
+
90
+ ---
91
+
92
+ ## 🔄 Workflow
93
+
94
+ 1. Upload documents (PDF)
95
+ 2. Chunk and clean text
96
+ 3. Generate embeddings using SentenceTransformers
97
+ 4. Store vectors in FAISS
98
+ 5. Query → retrieve relevant chunks
99
+ 6. Send context to LLM → generate answer
100
+
101
+ ---
102
+
103
+ ## 🔐 Authentication
104
+
105
+ - Password hashing: Passlib (bcrypt)
106
+ - Token system: JWT (python-jose)
107
+
108
+ Example flow:
109
+ 1. Register user
110
+ 2. Login → receive token
111
+ 3. Use token in protected endpoints
112
+
113
+ ---
114
+
115
+ ## 🐳 Docker (Optional)
116
+
117
+ ### Build image
118
+ ```bash
119
+ docker build -t rag-fastapi .
120
+ ```
121
+
122
+ ### Run container
123
+ ```bash
124
+ docker run -p 7860:7860 -e GROQ_API_KEY="your_api_key_here" rag-fastapi
125
+ ```
126
+
127
+ ---
128
+
129
+ ## ⚠️ Common Issues
130
+
131
+ ### ❌ ModuleNotFoundError: passlib
132
+ Fix:
133
+ ```bash
134
+ pip install passlib[bcrypt]
135
+ ```
136
+
137
+ ### ❌ API timeout issues
138
+ - Increase timeout in `httpx.AsyncClient`
139
+ - Check API key validity
140
+
141
+ ---
142
+
143
+ ## 📈 Future Improvements
144
+
145
+ - Streaming responses
146
+ - Multi-document indexing
147
+ - Role-based authentication
148
+ - UI integration (React/Streamlit)
149
+
150
+ ---
151
+
152
+ ## 🧠 Tech Stack
153
+
154
+ - FastAPI
155
+ - FAISS
156
+ - SentenceTransformers
157
+ - Passlib
158
+ - JWT (python-jose)
159
+ - httpx
160
+
161
+ ---
162
+
163
+ ## 📄 License
164
+
165
+ MIT License
166
+
167
+ ---
168
+
169
+ ## 👨‍💻 Author
170
+
171
+ Developed by Thyagharajan K K
172
+
app.py ADDED
@@ -0,0 +1,1019 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Fri Feb 20 13:39:23 2026
4
+
5
+ @author: THYAGHARAJAN
6
+ """
7
+ # In[]
8
+ #NOTE
9
+ #if you change the file name kkt_FastAPI_serverV21 change the same in core_imports.py file
10
+ #All uploaded files are saved in the doc_ingestion folder where the server file exists
11
+ # In[]
12
+
13
+ """
14
+ orchestrator layer
15
+ (torch26_cu124_trans_unsloth) D: cd kkt_secure_modular_rag_engine
16
+ uvicorn kkt_FastAPI_server:app --host 0.0.0.0 --port 8000
17
+ on Browser: http://localhost:3009
18
+ V1 works well when selected llama3:8b
19
+ V2 works with WebUI; LLM model selection, adding PDF files, system prompt and RAG prompt are integrated in the WebUI
20
+ but hallucination could not be controlled and some layers could not communicate with WebUI
21
+ Don't delete V2'
22
+ V3 server uses separate FastAPI Swagger client. Working
23
+ V4 HTML client is used. Ollama output is not obtained
24
+ V5 LLM provides output without references.
25
+ V6 References included
26
+ V7 References and Citations improved
27
+ V8 User interface was improved and tested
28
+ V9 doc_ingestion folder and its functionality was changed for opening the reference files on the browser.
29
+ But chat window didn't show any link
30
+ V10 Inline reference was modified. working
31
+ V11 facility needed to delete the knowledge were added but not tested completely
32
+ V12 same upload_file used for admin. Single ingestion system. does chunking,embedding,FAISS update,DB storage. Many modifications done.
33
+ V13 Changes made but not checked
34
+ V14 checked working well
35
+ V15 Reference modes are working well. Still at the end it reads vr vr vr
36
+ V16 speaking vr vr vr at the end was removed in the script.js and working fine.
37
+ Bibliography references may differ from the inline citations, because the LLM does not use every bibliography entry. Checked working
38
+ V17 References are properly displayed on all the three modes. Checked OK
39
+ V18 File upload UI for user (in index.html) was removed.
40
+ V19 UI was corrected. References were corrected. working fine.
41
+ V20 Save as PDF button was added in the index and script files
42
+ V21 if you change the file name kkt_FastAPI_serverV21 change the same in core_imports.py file
43
+ V22 Uses Groq, API key and cloud for deploying on Hugging Face
44
+ """
45
+
46
+ # In[]
47
+ import os
48
+ import re
49
+ import httpx
50
+ import threading
51
+ from fastapi import FastAPI, HTTPException, UploadFile, File
52
+ #FastAPI application serves as a server for webui and client for local Ollama. This is to be done asynchronously
53
+ #So, the app acts as a proxy to Ollama
54
+ from pydantic import BaseModel
55
+ from typing import List, Optional
56
+ from datetime import datetime
57
+ import sqlite3
58
+ from passlib.context import CryptContext
59
+ from jose import JWTError, jwt
60
+ from fastapi import Depends
61
+ from fastapi.security import OAuth2PasswordBearer, OAuth2PasswordRequestForm
62
+ import numpy as np
63
+ import faiss
64
+
65
+
66
+
67
+ from sentence_transformers import SentenceTransformer
68
+ from rag.chunker import DocChunker #import class
69
+ #from rag.indexer import build_vector_index #imports function
70
+ import requests
71
+
72
+ from fastapi.responses import FileResponse
73
+ from fastapi.staticfiles import StaticFiles
74
+ from fastapi.middleware.cors import CORSMiddleware
75
+
76
+
77
+
78
+ import shutil
79
+ from pathlib import Path
80
+
81
+ from utils.text_cleanerV2 import clean_text
82
+ from utils.admin_fns import router as admin_router
83
+
84
+
85
+ #from rag.retriever_factory import get_retriever
86
+ '''
87
+ from rag.citation_validator import validate_citations
88
+ from rag.hallucination_control import apply_confidence_filter
89
+ from rag.permission_gate import check_external_access
90
+ '''
91
+
92
+
93
+ from models.model_config import VECTOR_BACKEND, INDEX_PATH
94
+
95
# JWT configuration.
# The signing key can be supplied through the JWT_SECRET_KEY environment variable so that a
# deployment does not have to rely on the literal committed to the repository. The literal is
# kept only as a fallback so that existing setups keep working; set the variable in production.
SECRET_KEY = os.getenv("JWT_SECRET_KEY") or "KKT_SUPER_SECRET_KEY_CHANGE_THIS"
ALGORITHM = "HS256"
ACCESS_TOKEN_EXPIRE_MINUTES = 60

#UPLOAD_FOLDER = "uploads"
# Sentence-embedding model used to encode chunk text and queries into vectors.
EMBEDDING_MODEL = SentenceTransformer("all-MiniLM-L6-v2")

# FAISS index storing the chunk vectors; None until it has been built or first populated.
VECTOR_INDEX = None
# False while the index is being (re)built, so that queries arriving early can be rejected.
INDEX_READY = False

pwd_context = CryptContext(schemes=["bcrypt"], deprecated="auto")
oauth2_scheme = OAuth2PasswordBearer(tokenUrl="login")
110
+
111
+ # In[]
112
+ '''
113
+ # =====================================
114
+ # 🔐 Allowed Models (Ollama)
115
+ # =====================================
116
+ ALLOWED_MODELS = [
117
+ "llama3:8b",
118
+ "llama3.1:8b",
119
+ "phi3:mini",
120
+ "Phi-3 Medium",
121
+ "mistral",
122
+ "qwen2.5:7b",
123
+ "deepseek-r1:7b",
124
+ "llama3.2:3b"
125
+ ]
126
+ #
127
+ # Default model (if WebUI sends None)
128
+ DEFAULT_MODEL = "deepseek-r1:7b"
129
+ '''
130
+
131
+
132
+ # =====================================
133
+ # 🔐 Allowed Models (Groq)
134
+ # =====================================
135
# Groq model identifiers a client may request; the chat endpoint rejects any other value.
# NOTE(review): confirm that every id below is still served by Groq before deploying.
ALLOWED_MODELS = [
    "llama-3.1-8b-instant",
    "llama-3.3-70b-versatile",
    "mixtral-8x7b-32768",
    "gemma-7b-it"
]

# Model used when the request does not name one.
DEFAULT_MODEL = "llama-3.1-8b-instant"
143
+ #GROQ_API_KEY -- the environment variable was set using setx GROQ_API_KEY "<your-groq-api-key>" (real key redacted: never commit it, and rotate the one that was published here)
144
+ #in Windows Powershell
145
+
146
+ #Ollama endpoint (since FastAPI runs on Windows host)
147
+ #OLLAMA_URL = "http://localhost:11434/api/chat" #Hugging Face Spaces does NOT support Ollama Delete this not needed
148
+ # In[] SERVER-SIDE DOCUMENT REPOSITORY (FOR REFERENCE)
149
+ from config import UPLOAD_FOLDER, DB_PATH_FILE, FAISS_INDEX_PATH
150
# Make sure the upload directory exists before it is mounted as a static folder or written to.
os.makedirs(UPLOAD_FOLDER, exist_ok=True)

# Lock taken around FAISS index searches, additions and rebuilds to prevent index corruption.
FAISS_LOCK = threading.Lock()
153
+
154
+ # In[] # FASTAPI SERVER
155
+
156
# FastAPI() returns an ASGI (Asynchronous Server Gateway Interface) compatible application
# that can be run by the uvicorn server. The title is shown on the /docs page.
# NOTE(review): title/version still say V11 although the version notes in this file reach V22.
app = FastAPI(
    title="KKT Secure Modular RAG Engine V11",
    version="11.0.0",
    description="Secure modular RAG pipeline with authentication, chunking, FAISS retrieval"
)
#app = FastAPI() #without metadata

# Configure CORS (Cross-Origin Resource Sharing) for the app.
# NOTE(review): every origin is allowed together with credentials; restrict allow_origins
# to the real front-end origin(s) if the API is not meant to be called from any site.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


# UI files: the URL prefix /static is mapped to the local folder FastAPI_Client,
# e.g. URL /static/app.js -> file FastAPI_Client/app.js
app.mount("/static", StaticFiles(directory="FastAPI_Client"), name="static")

# Uploaded documents: the URL prefix /uploads is mapped to UPLOAD_FOLDER,
# e.g. URL /uploads/file1.pdf -> file <UPLOAD_FOLDER>/file1.pdf.
# The browser only ever sees the /uploads prefix, not the real folder name on the server.
app.mount("/uploads", StaticFiles(directory=UPLOAD_FOLDER), name="uploads")
# Register all admin endpoints defined in utils.admin_fns with the application.
app.include_router(admin_router)
183
+ #UPLOAD_FOLDER = /kkt_secure_modular_rag_engine/doc_ingestion
184
+ #URL: /uploads/file1.pdf
185
+ #File: /kkt_secure_modular_rag_engine/doc_ingestion/file1.pdf
186
+ #uploaded documents are saved in the doc_ingestion folder but , when they are opened in the browser with folder name defined here
187
+ #It will not expose the actual folder in the server http://localhost:8000/uploads/AI_book.pdf
188
+
189
+ # In[] Admin Credentials
190
+
191
+ # Fixed admin user
192
# Fixed admin account.
ADMIN_USERNAME = "admin"
# The admin password can be supplied through the ADMIN_PASSWORD environment variable so that
# it does not have to live in the source. The literal is kept only as a backward-compatible
# fallback (for testing); set the variable in any real deployment.
ADMIN_PASSWORD = os.getenv("ADMIN_PASSWORD") or "Tnivedha@123"
pwd_context = CryptContext(schemes=["bcrypt"], deprecated="auto")
# Only the bcrypt hash is used when credentials are checked.
ADMIN_PASSWORD_HASH = pwd_context.hash(ADMIN_PASSWORD)
196
+
197
def authenticate_admin(username: str, password: str):
    """Check credentials against the fixed admin account.

    Returns ``{"username": username}`` when both the name and the password match,
    otherwise ``None``. The password is only verified when the name already matches.
    """
    name_matches = username == ADMIN_USERNAME
    if not (name_matches and verify_password(password, ADMIN_PASSWORD_HASH)):
        return None
    return {"username": username}  # user info for the token
201
+
202
+
203
+ # In[]
204
+ # =====================================
205
+ # 🔐 Current User Dependency
206
+ # =====================================
207
def get_current_user(token: str = Depends(oauth2_scheme)):
    """FastAPI dependency that resolves the bearer token to a username.

    Decodes the JWT with SECRET_KEY/ALGORITHM and returns its ``sub`` claim.

    Raises:
        HTTPException: 401 when the token cannot be decoded or carries no ``sub``.
    """
    try:
        claims = jwt.decode(token, SECRET_KEY, algorithms=[ALGORITHM])
    except JWTError:
        raise HTTPException(status_code=401, detail="Invalid token")

    subject: str = claims.get("sub")
    if subject is None:
        raise HTTPException(status_code=401, detail="Invalid token")
    return subject
216
+
217
+
218
+ # In[] Data Base Tables
219
+
220
+ #Create an SQLite connection to the database file. Used in many places to connect to the database
221
def get_db():
    """Open and return a new SQLite connection to DB_PATH_FILE.

    ``check_same_thread=False`` lets the connection be used from a thread other than
    the one that created it. The caller is responsible for closing the connection.
    """
    connection = sqlite3.connect(DB_PATH_FILE, check_same_thread=False)
    return connection
223
+
224
+
225
+ # =====================================
226
+ # 🗄 SQLite Database Setup
227
+ # =====================================
228
def init_db():
    """Create the ``users`` and ``document_chunks`` tables if they do not exist yet.

    The connection is always closed, even when one of the statements fails.
    """
    conn = get_db()  # connect DB file
    try:
        cursor = conn.cursor()

        # Users table (existing)
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS users (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                username TEXT UNIQUE NOT NULL,
                hashed_password TEXT NOT NULL,
                created_at TEXT NOT NULL
            )
        """)

        # Chunk metadata table: one row per embedded chunk, keyed to FAISS by faiss_id
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS document_chunks (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                faiss_id INTEGER,
                source TEXT NOT NULL,
                path TEXT,
                page INTEGER NOT NULL,
                text TEXT NOT NULL,
                created_at TEXT NOT NULL
            )
        """)

        conn.commit()
    finally:
        conn.close()


init_db()  # the database is initialised when the module is imported
259
+
260
+ '''
261
+ Explanation
262
+ faiss_id : 525
263
+ source : "Unit 1 AI & Python Complete.pdf" only file name is stored
264
+ path : "D:/.../doc_ingestion/Unit 1 AI & Python Complete.pdf"
265
+ text : "Artificial Intelligence is..."
266
+ page : 7
267
+ created_at : timestamp
268
+
269
+ Each real chunk will look like
270
+ {
271
+ "source": "Unit 1 AI & Python Complete.pdf",
272
+ "text": "Artificial Intelligence is...",
273
+ "page": 7
274
+ }
275
+
276
+ After embedding, FAISS contains
277
+ FAISS ID --> vector(one chunk text)
278
+ '''
279
+
280
+ # In[]
281
+
282
+ # ========================================================
283
+ # 🔐Store and Retrieve Chunks via SQLite
284
+ # ========================================================
285
def store_chunks_in_db(chunks, faiss_ids):
    """Insert one ``document_chunks`` row per (chunk, FAISS id) pair.

    Args:
        chunks: list of dicts such as ``{"source": "file1.pdf", "text": "...", "page": 3}``;
            ``page`` is optional and defaults to 0.
        faiss_ids: ids of the matching embedding vectors, in the same order as ``chunks``.

    All rows are written in a single transaction and the connection is always closed.
    """
    # zip pairs every chunk with the id of its embedding vector.
    rows = [
        (
            fid,
            chunk["source"],
            f"{UPLOAD_FOLDER}/{chunk['source']}",
            chunk["text"],
            chunk.get("page", 0),
        )
        for chunk, fid in zip(chunks, faiss_ids)
    ]

    conn = get_db()  # connect DB file
    try:
        conn.executemany(
            """
            INSERT INTO document_chunks
            (faiss_id, source, path, text, page, created_at)
            VALUES (?, ?, ?, ?, ?, datetime('now'))
            """,
            rows,
        )
        conn.commit()
    finally:
        conn.close()
307
+
308
+
309
def get_next_faiss_id():
    """Return the next unused FAISS id based on what is already stored in the database.

    Reads ``MAX(faiss_id)`` from ``document_chunks``; returns 0 when the table holds no
    id yet, otherwise the largest id plus one.
    """
    conn = get_db()  # connect DB file
    lookup = conn.cursor()
    lookup.execute("SELECT MAX(faiss_id) FROM document_chunks")
    (highest,) = lookup.fetchone()  # fetchone() yields a one-element tuple (max_value,)
    conn.close()

    return 0 if highest is None else highest + 1
326
+
327
+
328
def fetch_chunks_by_faiss_ids(faiss_ids):
    """Load chunk metadata for the given FAISS ids, preserving their order.

    Args:
        faiss_ids: list of integer ids as returned by the FAISS search (most relevant first).

    Returns:
        A list of dicts with keys ``faiss_id``, ``text``, ``source`` and ``page``, in the
        order of ``faiss_ids``. Ids that have no row in the database are skipped, and an
        empty input yields an empty list without touching the database.
    """
    if len(faiss_ids) == 0:
        return []

    # One "?" per id, e.g. [101, 205, 87] -> "?,?,?"
    placeholders = ",".join(["?"] * len(faiss_ids))
    query = f"""
        SELECT faiss_id, text, source, page
        FROM document_chunks
        WHERE faiss_id IN ({placeholders})
    """

    conn = get_db()  # connect DB file
    try:
        rows = conn.execute(query, faiss_ids).fetchall()
    finally:
        conn.close()

    # rows is a list of tuples such as (101, "text1", "file1.pdf", 2)
    id_to_row = {
        row[0]: {
            "faiss_id": row[0],
            "text": row[1],
            "source": row[2],
            "page": row[3],
        }
        for row in rows
    }

    # SQL does not return rows in the requested order, so re-order to keep the FAISS ranking.
    return [id_to_row[fid] for fid in faiss_ids if fid in id_to_row]
369
+
370
+
371
def retrieve_relevant_chunks(query, top_k=5):
    """Return the stored chunks most similar to ``query``.

    FAISS supplies the ids of the nearest vectors and SQLite supplies their metadata.

    Args:
        query: the user's question text.
        top_k: maximum number of chunks to return.

    Returns:
        A list of chunk dicts (see ``fetch_chunks_by_faiss_ids``), best match first.
        The list is empty when nothing is indexed.

    Raises:
        HTTPException: 503 while the index is still being built, 500 when the embedding
            model is not loaded.
    """
    global VECTOR_INDEX, EMBEDDING_MODEL, INDEX_READY

    if not INDEX_READY:
        raise HTTPException(status_code=503, detail="Index is still building")
    if EMBEDDING_MODEL is None:
        raise HTTPException(status_code=500, detail="Embedding model not loaded")
    if VECTOR_INDEX is None:
        # Ready but nothing indexed (empty knowledge base): no evidence rather than a 503.
        return []

    # Embeddings are not normalised, so the L2 index ranks by Euclidean distance,
    # not by cosine similarity.
    query_embedding = np.array(EMBEDDING_MODEL.encode([query])).astype("float32")

    with FAISS_LOCK:
        index = VECTOR_INDEX  # re-read under the lock; a rebuild may have replaced it
        if index is None:
            return []
        # Never ask for more neighbours than the index holds.
        k = min(top_k, index.ntotal)
        if k <= 0:
            return []
        distances, indices = index.search(query_embedding, k)

    print("FAISS distances:", distances)
    print("FAISS indices:", indices)

    if indices is None or len(indices[0]) == 0:
        return []

    # FAISS pads missing results with -1.
    faiss_ids = [int(i) for i in indices[0] if i != -1]

    # Fetch metadata from SQLite
    return fetch_chunks_by_faiss_ids(faiss_ids)
402
+
403
+ # ========================================================
404
+ # 🔄 Rebuild FAISS Index From SQLite on Server Start
405
+ # ========================================================
406
+
407
def rebuild_faiss_index():
    """Rebuild the in-memory FAISS index from the chunks stored in SQLite.

    Holds FAISS_LOCK for the whole rebuild, including the write of the index file, so
    that no concurrent search or upload sees a half-built index. With no stored chunks
    the index is cleared and the index file is removed.
    """
    global VECTOR_INDEX, EMBEDDING_MODEL, INDEX_READY

    with FAISS_LOCK:
        INDEX_READY = False

        conn = get_db()
        try:
            rows = conn.execute("""
                SELECT faiss_id, text
                FROM document_chunks
                WHERE faiss_id IS NOT NULL
                ORDER BY faiss_id
            """).fetchall()
        finally:
            conn.close()

        if not rows:
            VECTOR_INDEX = None
            INDEX_READY = True

            if os.path.exists(FAISS_INDEX_PATH):
                os.remove(FAISS_INDEX_PATH)

            print("No documents found. FAISS cleared and file removed.")
            return

        ids = [row[0] for row in rows]
        # Embed the cleaned text, as upload_file does, so that rebuilt vectors match
        # the vectors that were added at upload time.
        texts = [clean_text(row[1]) for row in rows]

        embeddings = np.array(EMBEDDING_MODEL.encode(texts)).astype("float32")
        dimension = embeddings.shape[1]

        # IndexIDMap lets the vectors be addressed by the faiss_id stored in SQLite.
        rebuilt = faiss.IndexIDMap(faiss.IndexFlatL2(dimension))
        rebuilt.add_with_ids(embeddings, np.array(ids, dtype="int64"))

        VECTOR_INDEX = rebuilt
        INDEX_READY = True
        print(f"FAISS index rebuilt with {len(texts)} chunks.")

        # Persist while still holding the lock so an upload cannot modify the index mid-write.
        faiss.write_index(VECTOR_INDEX, FAISS_INDEX_PATH)
        print("FAISS index saved to disk.")
459
+
460
+ # In[]
461
+
462
@app.on_event("startup")
def startup_event():
    """Startup hook: ensure the upload folder exists and start the index rebuild.

    The rebuild runs in its own thread so that server start-up is not blocked by it.
    """
    print("Server started successfully")
    os.makedirs(UPLOAD_FOLDER, exist_ok=True)
    rebuild_worker = threading.Thread(target=rebuild_faiss_index)
    rebuild_worker.start()
467
+
468
+ # In[]
469
+ # ============================================
470
+ # Save uploaded file
471
+ # ============================================
472
async def save_file(file: UploadFile):
    """Write an uploaded file into UPLOAD_FOLDER and return the path it was saved to.

    Only the final path component of the client-supplied name is used, so the file
    cannot be written outside UPLOAD_FOLDER.

    Raises:
        HTTPException: 400 when the upload has no usable name or a file with the same
            name already exists.
    """
    filename = Path(file.filename or "").name
    if not filename:
        raise HTTPException(status_code=400, detail="Uploaded file has no name.")

    file_path = os.path.join(UPLOAD_FOLDER, filename)

    if os.path.exists(file_path):
        raise HTTPException(
            status_code=400,
            detail="File already exists. Please rename or delete the existing file."
        )

    try:
        with open(file_path, "wb") as buffer:
            # Copy in 1 MiB pieces so large uploads are never held in memory at once.
            while chunk := await file.read(1024 * 1024):
                buffer.write(chunk)
    except Exception:
        # Do not leave a truncated file behind; it would block a retry of the upload.
        if os.path.exists(file_path):
            os.remove(file_path)
        raise

    await file.seek(0)  # reset pointer (important)

    return file_path
489
+
490
+
491
+ # In[]
492
+
493
+ # =====================================
494
+ # 👤 User Registration
495
+ # =====================================
496
class UserRegister(BaseModel):
    """Request body of the POST /register endpoint."""
    # Login name; must be unique in the users table.
    username: str
    # Plain-text password as sent by the client; it is hashed before being stored.
    password: str
499
+
500
@app.post("/register")
def register(user: UserRegister):
    """Create a new user account.

    Stores the username together with the hash of the password.

    Raises:
        HTTPException: 400 when the username is already taken or is the reserved
            admin username.
    """
    # The fixed admin account is not stored in the database, so without this check anyone
    # could register its name and obtain a token whose subject is the admin username.
    if user.username == ADMIN_USERNAME:
        raise HTTPException(status_code=400, detail="Username already exists")

    hashed_pw = hash_password(user.password)

    conn = get_db()
    try:
        conn.execute(
            "INSERT INTO users (username, hashed_password, created_at) VALUES (?, ?, ?)",
            (user.username, hashed_pw, datetime.now().isoformat())
        )
        conn.commit()
    except sqlite3.IntegrityError:
        # The UNIQUE constraint on username was violated.
        raise HTTPException(status_code=400, detail="Username already exists")
    finally:
        conn.close()

    return {"message": "User registered successfully"}
519
+
520
+
521
+ # =====================================
522
+ # 🔑 Login Endpoint
523
+ # =====================================
524
+
525
@app.post("/login")
def login(form_data: OAuth2PasswordRequestForm = Depends()):
    """Exchange a username and password for a bearer token.

    The fixed admin account is checked first; any other name is looked up in the
    ``users`` table.

    Raises:
        HTTPException: 400 when the user is unknown or the password does not match.
    """
    supplied_name = form_data.username
    supplied_password = form_data.password

    def bearer_for(subject):
        # Token response in the shape expected by OAuth2 password-flow clients.
        return {
            "access_token": create_access_token(data={"sub": subject}),
            "token_type": "bearer",
        }

    # --- Hard-coded admin first ---
    if supplied_name == ADMIN_USERNAME and verify_password(supplied_password, ADMIN_PASSWORD_HASH):
        return bearer_for(ADMIN_USERNAME)

    # --- Otherwise fall back to database users ---
    conn = get_db()
    lookup = conn.cursor()
    lookup.execute(
        "SELECT id, username, hashed_password FROM users WHERE username = ?",
        (supplied_name,)
    )
    record = lookup.fetchone()
    conn.close()

    if not record:
        raise HTTPException(status_code=400, detail="Invalid credentials")

    _user_id, stored_name, stored_hash = record
    if not verify_password(supplied_password, stored_hash):
        raise HTTPException(status_code=400, detail="Invalid credentials")

    return bearer_for(stored_name)
552
+
553
+
554
+ # =======================================================================
555
+ # 🔐 Upload Files Using FastAPI User Interface and Split in to Chunks
556
+ # =======================================================================
557
def _discard_saved_upload(file_path):
    """Remove a just-saved upload that could not be ingested, if it is still on disk."""
    if os.path.exists(file_path):
        os.remove(file_path)


@app.post("/upload")
async def upload_file(file: UploadFile = File(...),current_user: str = Depends(get_current_user)):
    """Single ingestion path: save the file, chunk it, embed it, update FAISS, store metadata.

    Returns a JSON message for the client: either the number of chunks added or a note
    that no text could be extracted. When ingestion fails or yields no text, the saved
    file is removed again so that the same file can be uploaded once more instead of
    being rejected as "already exists".
    """
    global VECTOR_INDEX, EMBEDDING_MODEL, INDEX_READY

    file_path = await save_file(file)
    # Stripped so that it compares equal to the stripped chunk "source" below.
    filename = Path(file.filename).name.strip()

    try:
        # Chunk the documents found in the upload folder
        chunker = DocChunker(doc_folder=UPLOAD_FOLDER)
        all_chunks = chunker.chunk_documents()

        # Keep only the chunks that belong to the file just uploaded
        new_chunks = [c for c in all_chunks if c["source"].strip() == filename]

        if not new_chunks:
            _discard_saved_upload(file_path)
            return {"message": "No text extracted from document."}

        new_texts = [clean_text(chunk["text"]) for chunk in new_chunks]

        # Encode new chunks
        new_embeddings = np.array(EMBEDDING_MODEL.encode(new_texts)).astype("float32")

        # Determine FAISS ids using SQLite
        start_id = get_next_faiss_id()
        faiss_ids = list(range(start_id, start_id + len(new_embeddings)))

        # Update FAISS first
        with FAISS_LOCK:
            if VECTOR_INDEX is None:
                dimension = new_embeddings.shape[1]
                VECTOR_INDEX = faiss.IndexIDMap(faiss.IndexFlatL2(dimension))

            VECTOR_INDEX.add_with_ids(
                new_embeddings,
                np.array(faiss_ids, dtype="int64")
            )
            faiss.write_index(VECTOR_INDEX, FAISS_INDEX_PATH)
            INDEX_READY = True

        # Store metadata only after FAISS succeeded
        store_chunks_in_db(new_chunks, faiss_ids)
    except Exception:
        _discard_saved_upload(file_path)
        raise

    return {
        "message": f"{file.filename} uploaded and indexed successfully",
        "chunks_added": len(new_chunks)
    }
609
+
610
@app.post("/admin/upload-document")
async def upload_document(
    file: UploadFile = File(...),
    current_user: str = Depends(get_current_user)
):
    """Admin upload endpoint; delegates to ``upload_file``.

    Any failure is turned into a JSON body with a ``message`` key so that the client
    alert always has something to show.
    """
    try:
        outcome = await upload_file(file, current_user)
    except Exception as e:
        return {"message": f"Upload failed: {str(e)}"}
    return outcome
620
+
621
+
622
+ # In[]
623
+ # =====================================
624
+ # 📦 Request Models
625
+ # =====================================
626
class Message(BaseModel):
    """One chat message of a chat-completion request."""
    # Speaker of the message; it is lower-cased before being forwarded to the LLM.
    role: str
    # Text of the message.
    content: str
629
+
630
+
631
class ChatRequest(BaseModel):
    """Request body of the POST /v1/chat/completions endpoint."""
    # Requested model id; DEFAULT_MODEL is used when omitted.
    model: Optional[str] = None
    # Conversation so far; the last entry is used as the retrieval query.
    messages: List[Message]
    # Sampling temperature forwarded to the LLM.
    temperature: Optional[float] = 0.7
    # How references are shown: "inline", "list", "both" or "none".
    reference_style: Optional[str] = "both"
636
+
637
+
638
+ # =====================================
639
+ # 🌍 Tamil Detection
640
+ # =====================================
641
def contains_tamil(text: str) -> bool:
    """Return True when ``text`` holds at least one character of the Tamil Unicode block."""
    tamil_hit = re.search(r'[\u0B80-\u0BFF]', text)
    return tamil_hit is not None
643
+
644
+ # In[]
645
+
646
+ # =====================================
647
+ # 🛡 Retrieval Evidence Detection
648
+ # =====================================
649
def has_retrieved_context(messages: List[Message]) -> bool:
    """
    Tell whether any message carries a retrieved-context marker.

    A message counts when its lower-cased content contains 'source:', 'page'
    or 'document:'.
    """
    markers = ("source:", "page", "document:")
    for message in messages:
        lowered = message.content.lower()
        if any(marker in lowered for marker in markers):
            return True
    return False
659
+
660
+
661
def refusal_response(reason: str):
    """Build an OpenAI-style chat completion whose assistant message is ``reason``.

    Used when the request is answered by the control layer instead of the LLM, so the
    model is reported as "control-layer" and all token counts are zero.
    """
    reply = {"role": "assistant", "content": reason}
    only_choice = {"index": 0, "message": reply, "finish_reason": "stop"}
    no_usage = {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}

    return {
        "id": "chatcmpl-local",
        "object": "chat.completion",
        "created": int(datetime.now().timestamp()),
        "model": "control-layer",
        "choices": [only_choice],
        "usage": no_usage,
    }
683
+
684
+
685
def _cited_references(markers, references_map):
    """Return the reference texts of the given markers, in first-use order, without duplicates."""
    cited = []
    for marker in markers:
        if marker in references_map:
            reference = references_map[marker]
            if reference not in cited:
                cited.append(reference)
    return cited


def apply_reference_style(assistant_message, references_map, style):
    """Render the ``[DocN]`` markers of an LLM answer according to ``style``.

    Args:
        assistant_message: raw answer text, possibly containing ``[DocN]`` markers.
        references_map: mapping from marker (``"[Doc1]"``) to its reference text/HTML.
        style: ``"inline"`` replaces every marker by ``(reference)``;
            ``"list"`` removes the markers and appends a reference list (all references
            when the answer used no known marker);
            ``"both"`` replaces the markers and appends the list of references used;
            ``"none"`` removes the markers and appends nothing.
            Any other value leaves the markers untouched.

    Returns:
        The formatted answer. In every mode, parenthesised ``.pdf`` citations and any
        "References:" section written by the model itself are removed first.
    """
    # Remove inline references the model wrote on its own, e.g. "(file.pdf, page 3)".
    assistant_message = re.sub(r"\([^)]*\.pdf[^)]*\)", "", assistant_message, flags=re.IGNORECASE)
    # Remove a bibliography the model wrote on its own (everything from "References:" on).
    assistant_message = re.sub(r"References:.*", "", assistant_message, flags=re.IGNORECASE | re.DOTALL)

    # Collect the markers before they are replaced or stripped.
    used_markers = re.findall(r"\[Doc\d+\]", assistant_message)

    if style in ("inline", "both"):
        for doc_marker, ref_text in references_map.items():
            assistant_message = assistant_message.replace(doc_marker, f"({ref_text})")
    elif style in ("list", "none"):
        # "none" strips the markers too: the model may emit them although told not to.
        assistant_message = re.sub(r"\[Doc\d+\]", "", assistant_message)

    if style in ("list", "both"):
        refs_list = _cited_references(used_markers, references_map)
        if style == "list" and not refs_list:
            # The model was told not to cite inline, so fall back to every retrieved source.
            refs_list = list(references_map.values())
        if refs_list:
            assistant_message += "<br><br><br><b>References:</b><br>"
            assistant_message += "<br>".join(f"- {r}" for r in refs_list)

    return assistant_message
745
+
746
+
747
+
748
+ # =====================================
749
+ # 🔐 Password Utilities
750
+ # =====================================
751
def hash_password(password: str) -> str:
    """Return the hash of ``password`` produced by the module's ``pwd_context``."""
    digest = pwd_context.hash(password)
    return digest
753
+
754
def verify_password(plain_password: str, hashed_password: str) -> bool:
    """Return True when ``plain_password`` matches ``hashed_password`` according to ``pwd_context``."""
    matches = pwd_context.verify(plain_password, hashed_password)
    return matches
756
+
757
def create_access_token(data: dict):
    """Sign ``data`` as a JWT that expires after ACCESS_TOKEN_EXPIRE_MINUTES.

    Args:
        data: claims to embed; must contain ``sub``. An ``exp`` claim supplied by the
            caller is kept, otherwise one is added. The dict itself is not modified.

    Returns:
        The encoded token.

    Raises:
        ValueError: when ``data`` has no ``sub`` claim.
    """
    from datetime import timedelta, timezone

    if "sub" not in data:
        raise ValueError("Token data must include 'sub'")

    claims = dict(data)
    # Without an expiry a leaked token would stay valid forever.
    claims.setdefault(
        "exp",
        datetime.now(timezone.utc) + timedelta(minutes=ACCESS_TOKEN_EXPIRE_MINUTES),
    )

    return jwt.encode(claims, SECRET_KEY, algorithm=ALGORITHM)
762
+
763
+
764
+
765
@app.get("/protected")
def protected_route(current_user: str = Depends(get_current_user)):
    """Greet the authenticated caller by name."""
    greeting = f"Hello {current_user}"
    return {"message": greeting}
768
+
769
+ #root endpoint
770
@app.get("/")
def serve_ui():
    """Root endpoint: return the web UI page."""
    index_page = FileResponse("FastAPI_Client/index.html")
    return index_page
773
+
774
@app.get("/v1/models")
def list_models():
    """List the allowed models, one entry per id in ALLOWED_MODELS."""
    entries = []
    for model_id in ALLOWED_MODELS:
        entries.append({
            "id": model_id,
            "object": "model",
            "created": 0,
            "owned_by": "local"
        })
    return {"object": "list", "data": entries}
788
+
789
+ # In[]
790
+
791
+ # =====================================
792
+ # 💬 Chat Endpoint
793
+ # =====================================
794
# Reference styles understood by the chat endpoint; anything else falls back to "both".
_REFERENCE_STYLES = ("inline", "list", "both", "none")

# style -> (citation instruction, rules text) injected into the system prompt.
_CITATION_PROMPTS = {
    "none": (
        "STRICTLY DO NOT include any citations or markers.",
        "\n"
        "- Do NOT include any citation markers like [Doc1].\n"
        "- Do NOT include any References section.\n",
    ),
    "inline": (
        "Include inline citation markers like [Doc1].",
        "\n"
        "- Use ONLY the markers [Doc1], [Doc2], etc.\n"
        "- Do NOT write document names yourself.\n"
        "- Do NOT invent citations.\n"
        "- Do NOT include any References section.\n",
    ),
    "list": (
        "STRICTLY DO NOT include any inline citation markers like [Doc1].",
        "\n"
        "- Do NOT include any inline citation markers like [Doc1].\n"
        "- Do NOT write document names yourself.\n"
        "- Do NOT invent citations.\n"
        "- Do NOT include any References section.\n",
    ),
    "both": (
        "STRICTLY include citation markers like [Doc1], [Doc2] in every factual sentence.",
        "\n"
        "- Use ONLY the markers [Doc1], [Doc2], etc.\n"
        "- Do NOT write document names yourself.\n"
        "- Do NOT invent citations.\n",
    ),
}


def _reference_link(source, page):
    """Return the HTML link shown for one retrieved chunk.

    The file name comes from an upload, so it is URL-quoted in the href and
    HTML-escaped in the visible label.
    """
    import html
    from urllib.parse import quote

    label = html.escape(f"{source} — Page {page}")
    return (
        f"<a href='/uploads/{quote(str(source))}#page={page}' target='_blank'>"
        f"{label}</a>"
    )


@app.post("/v1/chat/completions")
async def chat_completion(request: ChatRequest):
    """Answer a chat request from the local documents (retrieval-augmented generation).

    Steps: validate the model and reference style, retrieve the chunks most similar to
    the last message, refuse when nothing was retrieved, otherwise send the conversation
    plus a context-bearing system prompt to Groq and format the citations of the answer.

    Returns:
        An OpenAI-compatible ``chat.completion`` dict.

    Raises:
        HTTPException: 400 for a model that is not allowed or an empty message list;
            500 when GROQ_API_KEY is not configured or Groq answers with a non-200
            status; whatever ``retrieve_relevant_chunks`` raises.
    """
    # Model selection from the web UI
    selected_model = request.model or DEFAULT_MODEL

    if selected_model not in ALLOWED_MODELS:
        raise HTTPException(
            status_code=400,
            detail=f"Model '{selected_model}' is not allowed."
        )

    if not request.messages:
        raise HTTPException(status_code=400, detail="At least one message is required.")

    # Normalise the style before it is used, so an unknown or differently-cased value
    # cannot leave the prompt pieces undefined.
    style = (request.reference_style or "both").lower()
    if style not in _REFERENCE_STYLES:
        style = "both"

    user_message = request.messages[-1].content

    # Step 0: retrieve relevant chunks from FAISS + SQLite
    retrieved_chunks = retrieve_relevant_chunks(user_message, top_k=5)  # list of dicts
    print("Number of chunks:", len(retrieved_chunks))

    # Hallucination control: evidence gate (pre-LLM)
    if not retrieved_chunks:
        print("🚫 BLOCKED BEFORE LLM CALL — No retrieved evidence detected.")
        return refusal_response(
            "The answer is not found in the local documents provided by KKT."
        )

    # Build the context for the LLM and the marker -> reference map (source attribution)
    context_parts = []
    references_map = {}
    for i, c in enumerate(retrieved_chunks, start=1):
        chunk_text = clean_text(c["text"])
        context_parts.append(
            f"\n[Doc{i}]\nDocument: {c['source']}\nPage: {c['page']}\nContent:\n{chunk_text}\n"
        )
        references_map[f"[Doc{i}]"] = _reference_link(c["source"], c["page"])

    rag_context = "\n\n".join(context_parts)

    # Language handling: keep Tamil questions answered in Tamil.
    language_rule = "Always respond only in Tamil.\n\n" if contains_tamil(user_message) else ""

    citation_instruction, rules_text = _CITATION_PROMPTS[style]

    # Inject the retrieved context as the system message
    system_prompt = (
        "\nYou are a document-grounded AI assistant.\n\n"
        f"{language_rule}"
        "Answer the question ONLY using the provided context.\n\n"
        f"{citation_instruction}\n\n"
        "Rules:\n"
        f"{rules_text}\n\n"
        "If the answer is not present in the context, say the information is not available.\n\n"
        "Context:\n"
        f"{rag_context}\n"
    )

    final_messages = [{"role": "system", "content": system_prompt}]
    for m in request.messages:
        final_messages.append({
            "role": m.role.lower(),
            "content": m.content
        })

    api_key = os.getenv("GROQ_API_KEY")
    if not api_key:
        raise HTTPException(status_code=500, detail="GROQ_API_KEY is not configured on the server")

    # The API key is deliberately never printed.
    print("MODEL:", selected_model)
    print("AVAILABLE MODELS:", ALLOWED_MODELS)
    print("SENDING TO GROQ:", final_messages)

    try:
        async with httpx.AsyncClient(timeout=120.0) as client:
            response = await client.post(
                url="https://api.groq.com/openai/v1/chat/completions",  # cloud server
                headers={
                    "Authorization": f"Bearer {api_key}",
                    "Content-Type": "application/json"
                },
                json={
                    "model": selected_model,
                    "messages": final_messages,
                    "temperature": request.temperature,
                    "stream": False
                }
            )

        if response.status_code != 200:
            print("STATUS:", response.status_code)
            print("ERROR:", response.text)
            raise HTTPException(status_code=500, detail="Groq API Error")

        result = response.json()

        # OpenAI-style payload: choices[0].message.content; tolerate missing or empty parts.
        choices = result.get("choices") or [{}]
        assistant_message = (choices[0].get("message") or {}).get("content") or ""

        assistant_message = apply_reference_style(assistant_message, references_map, style)

    except Exception as e:
        print("FULL ERROR:", e)
        raise

    # Return an OpenAI-compatible response
    return {
        "id": "chatcmpl-local",
        "object": "chat.completion",
        "created": int(datetime.now().timestamp()),
        "model": selected_model,
        "choices": [
            {
                "index": 0,
                "message": {
                    "role": "assistant",
                    "content": assistant_message
                },
                "finish_reason": "stop"
            }
        ],
        "usage": {
            "prompt_tokens": 0,
            "completion_tokens": 0,
            "total_tokens": 0
        }
    }
1016
+
1017
+
1018
+
1019
+
config.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Mon Apr 13 19:22:03 2026
4
+
5
+ @author: THYAGHARAJAN
6
+ """
7
+ #NOTE
8
+ # All uploaded files are saved in the doc_ingestion folder, located where the server file exists
9
+
10
+ #BASE_DIR -- full absolute directory path of the current script file
11
+ #UPLOAD_FOLDER -- This folder is created if not available see startup_event() in the server file
12
+ #DB_FILE = "kkt_SQLite_DB.db"
13
+ #DB_PATH_FILE -- full path of the database file DB_FILE (kkt_SQLite_DB.db) in the same directory as this script (BASE_DIR)
14
+ #FAISS_INDEX_PATH = os.path.join(BASE_DIR, "faiss.index")
15
+
16
+
17
+ import os
18
+
19
# Absolute path of the directory that contains this script.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))


def _under_base(name):
    """Return the path of ``name`` placed directly inside BASE_DIR."""
    return os.path.join(BASE_DIR, name)


# Folder receiving uploaded documents; it is created by the server file.
UPLOAD_FOLDER = _under_base("doc_ingestion")

# SQLite database: file name and the full path the server file connects to.
DB_FILE = "kkt_SQLite_DB.db"
DB_PATH_FILE = _under_base(DB_FILE)

# File used to persist the FAISS index.
FAISS_INDEX_PATH = _under_base("faiss.index")
doc_ingestion/AIML_Unit1_RMD_ECE.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e12a289078f97daafeade764a58c9c217b5d25d2ba69c056bbf4f338046cad46
3
+ size 2663371
doc_ingestion/AIML_Unit2_RMD_ECE.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d68a2034ebc91200784fd39eae4818f407811e26e9d134183b710d2f3c28f663
3
+ size 3598628
faiss.index ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6e5420cf84512c0c5438b7c21831c098ba4a155f1c5b47ae2013f538732472e
3
+ size 1085522
kkt_SQLite_DB.db ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c4cd7fa92c9fec7ae91c60ac2c43f7e039dc8a9a8fb1498602987035caf4158c
3
+ size 389120
rag/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Wed Mar 4 12:55:04 2026
4
+
5
+ @author: THYAGHARAJAN
6
+ """
7
+
rag/chunker.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Tue Mar 3 16:41:30 2026
4
+
5
+ @author: THYAGHARAJAN
6
+
7
+ Reads PDFs from kkt_AIML_PDFs/
8
+ Chunk into fixed size segments
9
+ Return list of chunks with metadata
10
+ """
11
+
12
+ import os
13
+ from typing import List, Dict
14
+ from pypdf import PdfReader
15
+ from docx import Document
16
+ import pandas as pd
17
+ from PIL import Image
18
+ import pytesseract
19
+ import cv2
20
+ from pytesseract import Output
21
+
22
+ from utils.text_cleanerV2 import clean_text
23
+
24
+
25
+ import shutil
26
+
27
+ tesseract_path = shutil.which("tesseract")
28
+ if tesseract_path:
29
+ pytesseract.pytesseract.tesseract_cmd = tesseract_path
30
+
31
+
32
class DocChunker:
    """
    Handles document ingestion and text chunking.

    Every file directly inside ``doc_folder`` whose extension is supported
    (pdf, docx, xlsx/xls, png/jpg/jpeg) is converted to text, cleaned with
    ``clean_text`` and cut into overlapping, fixed-size character windows.

    Args:
        doc_folder: Directory that is scanned for documents.
        chunk_size: Maximum number of characters per chunk.
        overlap: Number of characters shared by two consecutive chunks.
            Must satisfy ``0 <= overlap < chunk_size`` (checked when
            ``chunk_documents`` runs).
    """

    def __init__(self, doc_folder: str, chunk_size: int = 500, overlap: int = 50):
        self.doc_folder = doc_folder
        self.chunk_size = chunk_size
        self.overlap = overlap

    # ---------------------------------------------------
    # Load and Parse Documents (PDF, DOCX, Excel, Images)
    # ---------------------------------------------------

    def load_pdfs(self) -> List[Dict]:
        """
        Reads all supported documents and returns page-level texts with metadata.
        (Method name preserved for compatibility.)

        Returns:
            A list of ``{"text", "source", "page"}`` dicts. PDFs contribute one
            entry per page that has extractable text; every other format
            contributes a single entry with ``page`` set to 1.

        A file that fails to parse is reported with ``print`` and skipped so
        that one bad document does not abort the whole ingestion run.
        """
        documents = []

        for filename in os.listdir(self.doc_folder):
            file_path = os.path.join(self.doc_folder, filename)
            # splitext only reports a real extension, so a file that is
            # literally named "pdf" is no longer mistaken for a PDF.
            ext = os.path.splitext(filename)[1].lower().lstrip(".")

            try:
                if ext == "pdf":
                    # Appended page by page so pages read before a mid-file
                    # failure are still kept.
                    for page_doc in self._iter_pdf_pages(file_path, filename):
                        documents.append(page_doc)
                elif ext == "docx":
                    documents.append(self._read_docx(file_path, filename))
                elif ext in ("xlsx", "xls"):
                    documents.append(self._read_excel(file_path, filename))
                elif ext in ("png", "jpg", "jpeg"):
                    documents.append(self._read_image(file_path, filename))
            except Exception as e:
                print(f"Error processing {filename}: {e}")

        return documents

    @staticmethod
    def _iter_pdf_pages(file_path: str, filename: str):
        """Yield one document dict per PDF page that has extractable text."""
        reader = PdfReader(file_path)
        for page_number, page in enumerate(reader.pages, start=1):
            text = page.extract_text()
            if text:
                yield {
                    "text": text.strip(),
                    "source": filename,
                    "page": page_number,
                }

    @staticmethod
    def _read_docx(file_path: str, filename: str) -> Dict:
        """Return all paragraphs of a DOCX file joined into a single document."""
        doc = Document(file_path)
        full_text = "\n".join([p.text for p in doc.paragraphs])
        return {"text": full_text.strip(), "source": filename, "page": 1}

    @staticmethod
    def _read_excel(file_path: str, filename: str) -> Dict:
        """Return the sheet loaded by ``pd.read_excel`` rendered as plain text."""
        df = pd.read_excel(file_path)
        return {"text": df.to_string(), "source": filename, "page": 1}

    @staticmethod
    def _read_image(file_path: str, filename: str) -> Dict:
        """OCR an image after orientation correction and basic preprocessing."""
        img = cv2.imread(file_path)
        if img is None:
            # cv2.imread signals an unreadable file by returning None.
            raise ValueError("image could not be read or decoded")

        # Detect orientation. If Tesseract cannot determine it, OCR the image
        # as-is instead of discarding the file.
        try:
            osd = pytesseract.image_to_osd(img, output_type=Output.DICT)
            angle = osd["rotate"]
        except pytesseract.TesseractError:
            angle = 0

        if angle == 90:
            img = cv2.rotate(img, cv2.ROTATE_90_CLOCKWISE)
        elif angle == 180:
            img = cv2.rotate(img, cv2.ROTATE_180)
        elif angle == 270:
            img = cv2.rotate(img, cv2.ROTATE_90_COUNTERCLOCKWISE)

        # Grayscale -> 2x upscale -> Otsu threshold before OCR.
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        gray = cv2.resize(gray, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)
        thresh = cv2.threshold(
            gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
        )[1]

        text = pytesseract.image_to_string(thresh, config="--psm 6")
        return {"text": text.strip(), "source": filename, "page": 1}

    # ---------------------------------------------------
    # Chunk Text
    # ---------------------------------------------------

    def chunk_documents(self) -> List[Dict]:
        """
        Splits document text into smaller chunks.

        Returns:
            List of ``{"text", "source", "page"}`` dicts. Each chunk holds at
            most ``chunk_size`` characters and consecutive chunks of the same
            page share ``overlap`` characters.

        Raises:
            ValueError: If ``chunk_size``/``overlap`` would not advance the
                window (previously this caused an endless loop).
        """
        step = self.chunk_size - self.overlap
        if self.chunk_size <= 0 or self.overlap < 0 or step <= 0:
            raise ValueError(
                "chunk_size must be positive and overlap must satisfy "
                "0 <= overlap < chunk_size"
            )

        chunks = []

        for page in self.load_pdfs():
            cleaned_text = clean_text(page["text"])
            total = len(cleaned_text)

            for start in range(0, total, step):
                end = start + self.chunk_size
                chunks.append({
                    "text": cleaned_text[start:end],
                    "source": page["source"],
                    "page": page["page"],
                })
                # This window already reached the end of the text. Another
                # iteration would only repeat part of the overlap region.
                if end >= total:
                    break

        return chunks
rag/qdrant_retriever.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Tue Mar 3 14:40:03 2026
4
+
5
+ @author: THYAGHARAJAN
6
+ """
7
+
8
+ from rag.base_retriever import BaseRetriever
9
+
10
+ class QdrantRetriever(BaseRetriever):
requirements.txt ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn
3
+ httpx
4
+ pydantic
5
+ python-multipart
6
+
7
+ sentence-transformers
8
+ faiss-cpu
9
+
10
+ pypdf
11
+ python-docx
12
+ pandas
13
+
14
+ pytesseract
15
+ opencv-python-headless
16
+ Pillow
17
+
18
+ passlib[bcrypt]
19
+ python-jose[cryptography]
20
+ requests
utils/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Wed Mar 4 12:55:04 2026
4
+
5
+ @author: THYAGHARAJAN
6
+ """
7
+
utils/admin_fns.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Sat Apr 11 15:03:04 2026
4
+
5
+ @author: THYAGHARAJAN
6
+ """
7
+ #line 20 import avoids circular error
8
+ from utils.core_imports import get_current_user
9
+
10
+ from fastapi import APIRouter, Depends, HTTPException
11
+ import os
12
+ import shutil
13
+ import threading #used for rebuild_faiss_index line 54
14
+
15
+
16
+ router = APIRouter() #the app acted as a proxy to Ollama in main file
17
+
18
@router.delete("/admin/delete-document")
def delete_document(filename: str, current_user: str = Depends(get_current_user)):
    """
    Remove one document from the index.

    Deletes every ``document_chunks`` row whose trimmed ``source`` equals
    ``filename``, removes the matching file from the upload folder (if there
    is one) and starts a FAISS rebuild in a background thread.

    Args:
        filename: Document name as stored in ``document_chunks.source``.
            Debugging showed that ``&`` in a name must be URL-encoded as
            ``%26`` by the client, otherwise the name arrives truncated.
        current_user: Injected by the ``get_current_user`` dependency.

    Raises:
        HTTPException: 404 when no chunk belongs to ``filename``.
    """
    # Lazy import avoids a circular import with the server module.
    from utils.core_imports import get_db, rebuild_faiss_index, get_upload_folder

    filename = filename.strip()

    conn = get_db()
    try:
        cursor = conn.cursor()
        cursor.execute(
            "DELETE FROM document_chunks WHERE TRIM(source)=?",
            (filename,)
        )
        deleted_count = cursor.rowcount
        conn.commit()
    finally:
        # Close the connection even when the query fails.
        conn.close()

    if deleted_count == 0:
        raise HTTPException(status_code=404, detail="Document not found")

    # Delete the physical file, but only when the resolved path stays inside
    # the upload folder, so a name such as "../x" cannot remove other files.
    upload_root = os.path.realpath(get_upload_folder())
    file_path = os.path.realpath(os.path.join(upload_root, filename))
    if file_path.startswith(upload_root + os.sep) and os.path.isfile(file_path):
        os.remove(file_path)

    # Rebuild FAISS once, in the background. The original also rebuilt
    # synchronously when the file existed, so two rebuilds could overlap.
    threading.Thread(target=rebuild_faiss_index).start()

    return {"message": f"{filename} removed from index"}
70
+
71
+
72
+
73
@router.delete("/admin/delete-folder")
def delete_folder(folder: str, current_user: str = Depends(get_current_user)):
    """
    Remove every indexed chunk whose ``source`` contains ``folder``.

    Only database rows are removed here; files on disk are left untouched.
    A FAISS rebuild is started in a background thread afterwards.

    Args:
        folder: Substring to look for in ``document_chunks.source``. It is
            matched literally: ``%`` and ``_`` are not treated as wildcards.
        current_user: Injected by the ``get_current_user`` dependency.

    Raises:
        HTTPException: 400 when ``folder`` is blank (a blank value would
            match, and delete, every row); 404 when nothing matched.
    """
    # Lazy import avoids a circular import with the server module.
    from utils.core_imports import get_db, rebuild_faiss_index

    if not folder.strip():
        raise HTTPException(status_code=400, detail="Folder name must not be empty")

    # Escape LIKE metacharacters so the user's text is matched literally.
    escaped = (
        folder.replace("\\", "\\\\")
        .replace("%", "\\%")
        .replace("_", "\\_")
    )

    conn = get_db()
    try:
        cursor = conn.cursor()
        cursor.execute(
            "DELETE FROM document_chunks WHERE source LIKE ? ESCAPE '\\'",
            (f"%{escaped}%",)
        )
        deleted_count = cursor.rowcount
        conn.commit()
    finally:
        # Close the connection even when the query fails.
        conn.close()

    if deleted_count == 0:
        raise HTTPException(status_code=404, detail="Folder not found")

    threading.Thread(target=rebuild_faiss_index).start()

    return {"message": f"{folder} folder removed from index"}
93
+
94
+
95
+
96
@router.delete("/admin/reset-index")
def reset_index(confirm: bool = False, current_user: str = Depends(get_current_user)):
    """
    Wipe the whole index: all ``document_chunks`` rows and all uploaded files.

    Args:
        confirm: Safety switch. Nothing is deleted unless it is true; the
            response then only explains how to confirm.
        current_user: Injected by the ``get_current_user`` dependency.

    Returns:
        A dict with a ``message`` describing the outcome.
    """
    # Lazy import avoids a circular import with the server module.
    from utils.core_imports import get_db, rebuild_faiss_index, get_upload_folder

    if not confirm:
        return {"message": "Set confirm=true to reset index"}

    conn = get_db()
    try:
        cursor = conn.cursor()
        # Delete ALL rows in the document_chunks table.
        cursor.execute("DELETE FROM document_chunks")
        conn.commit()
    finally:
        # Close the connection even when the query fails.
        conn.close()

    # Empty the upload folder by recreating it. The isdir guard keeps a
    # missing folder from raising after the table has been wiped and before
    # the FAISS rebuild is started.
    upload_dir = get_upload_folder()
    if os.path.isdir(upload_dir):
        shutil.rmtree(upload_dir)
    os.makedirs(upload_dir, exist_ok=True)

    threading.Thread(target=rebuild_faiss_index).start()

    return {"message": "Index reset completed"}
121
+
122
+
123
+
124
@router.get("/admin/list-documents")
def list_documents(current_user: str = Depends(get_current_user)):
    """
    List every indexed document together with its number of chunks.

    Args:
        current_user: Injected by the ``get_current_user`` dependency.

    Returns:
        ``{"documents": [{"document": <source>, "chunks": <count>}, ...]}``
        with one entry per distinct ``source`` in ``document_chunks``.
    """
    # Lazy import avoids a circular import with the server module.
    from utils.core_imports import get_db

    conn = get_db()
    try:
        cursor = conn.cursor()
        cursor.execute("""
        SELECT source, COUNT(*) as chunks
        FROM document_chunks
        GROUP BY source
        """)
        rows = cursor.fetchall()
    finally:
        # Close the connection even when the query fails.
        conn.close()

    docs = [{"document": source, "chunks": count} for source, count in rows]

    return {"documents": docs}
utils/core_imports.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Sun Apr 12 22:31:05 2026
4
+
5
+ @author: THYAGHARAJAN
6
+ """
7
# NOTE
# If you change the kkt_FastAPI_server file name, update SERVER_MODULE below
# or set the KKT_SERVER_MODULE environment variable to the new module name.
# NOTE(review): the Dockerfile starts uvicorn with a module named
# "kkt_FastAPI_serverV22", while the default here is "app" - confirm that
# both refer to the same running server module.

# =====================================
# Centralized Server Module Import
# =====================================
import importlib
import os

# Name of the module that defines get_db, rebuild_faiss_index,
# get_current_user and UPLOAD_FOLDER. Defaults to "app".
SERVER_MODULE = os.environ.get("KKT_SERVER_MODULE", "app")


def _get_server():
    """Import the server module on first use and return it.

    Importing lazily (at call time instead of at import time) is what lets
    the server module and the utils modules reference each other without a
    circular-import error.
    """
    return importlib.import_module(SERVER_MODULE)


# =====================================
# Lazy Re-export functions
# =====================================

def get_db():
    """Return whatever the server module's ``get_db()`` returns."""
    return _get_server().get_db()


def rebuild_faiss_index():
    """Run the server module's ``rebuild_faiss_index()`` and return its result."""
    return _get_server().rebuild_faiss_index()


def get_current_user():
    """Return the server module's ``get_current_user`` callable (not its result).

    NOTE(review): utils/admin_fns.py passes this wrapper itself to
    ``Depends(...)``. FastAPI would then call this zero-argument function and
    inject the returned function object as ``current_user``, so the server's
    real authentication dependency may never be executed for those routes.
    Confirm, and if so expose a dependency with the real signature instead.
    """
    return _get_server().get_current_user


def get_upload_folder():
    """Return the server module's ``UPLOAD_FOLDER`` path."""
    return _get_server().UPLOAD_FOLDER
utils/text_cleanerV1.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Wed Mar 4 12:25:04 2026
4
+
5
+ @author: THYAGHARAJAN
6
+ """
7
+
8
+ import re
9
+
10
def clean_text(text: str) -> str:
    """
    Basic PDF text cleaning for RAG.

    Removes URLs, dd-dd-dddd dates, "Scan the QR code ..." instruction lines,
    surplus whitespace and immediately repeated words.

    Args:
        text: Raw extracted text. ``None`` or an empty string yields ``""``
            (same convention as the V2 cleaner) instead of raising.

    Returns:
        The cleaned text as a single line with words separated by one space.
    """
    if not text:
        return ""

    # Remove URLs
    text = re.sub(r"http\S+", "", text)

    # Remove standalone dates like 02-03-2026
    text = re.sub(r"\b\d{2}-\d{2}-\d{4}\b", "", text)

    # Remove QR instruction lines ('.' stops at the end of the line)
    text = re.sub(r"Scan the QR code.*", "", text, flags=re.IGNORECASE)

    # Splitting on whitespace and re-joining collapses every run of spaces,
    # tabs and newlines into a single space.
    words = text.split()

    # Keep a word only when it differs from the word directly before it.
    deduped = [w for i, w in enumerate(words) if i == 0 or w != words[i - 1]]

    return " ".join(deduped)
utils/text_cleanerV2.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Wed Mar 4 12:25:04 2026
4
+
5
+ @author: THYAGHARAJAN
6
+ """
7
+
8
+ import re
9
+ import unicodedata
10
+
11
def clean_text(text: str) -> str:
    """
    Main cleaning pipeline.

    Order matters: the header/footer and page-number steps are line based,
    so line breaks must still be present when they run. They are only
    collapsed by the final steps.

    Args:
        text: Raw extracted text; ``None`` or empty input yields ``""``.

    Returns:
        Cleaned text with words separated by single spaces.
    """
    if not text:
        return ""

    text = normalize_unicode(text)
    text = remove_non_printable(text)
    text = remove_headers_footers(text)
    text = remove_page_numbers(text)
    text = remove_extra_whitespace(text)
    text = remove_duplicate_words(text)

    return text


def normalize_unicode(text: str) -> str:
    """
    Normalize unicode characters to a consistent form (NFKC).
    Prevents strange PDF extraction artifacts.
    """
    return unicodedata.normalize("NFKC", text)


def remove_extra_whitespace(text: str) -> str:
    """
    Remove excessive spaces, tabs, and line breaks.
    """
    collapsed = re.sub(r"[ \t]+", " ", text)            # collapse spaces/tabs
    collapsed = re.sub(r"\n\s*\n+", "\n\n", collapsed)  # max 2 newlines
    return collapsed.strip()


def remove_page_numbers(text: str) -> str:
    """
    Remove standalone page numbers.
    Example: '12', '- 23 -', 'Page 5'

    A line is dropped when it holds nothing but a number with optional
    surrounding dashes. This includes the first and last line of the text,
    where page numbers usually sit and which the earlier newline-delimited
    pattern could not match.
    """
    number_only = re.compile(r"[-–]?\s*\d+\s*[-–]?")
    kept_lines = [
        line for line in text.split("\n")
        if not number_only.fullmatch(line.strip())
    ]
    text = "\n".join(kept_lines)
    text = re.sub(r"Page\s*\d+", "", text, flags=re.IGNORECASE)
    return text


def remove_headers_footers(text: str) -> str:
    """
    Remove common repeating header/footer patterns.
    Customize if needed.

    The ``.*`` patterns delete up to the end of the current line, so the
    input must still contain its line breaks.
    """
    patterns = [
        r"Copyright\s.*",
        r"All rights reserved.*",
        r"www\.[^\s]+",
        r"http[s]?://[^\s]+",
    ]

    for pattern in patterns:
        text = re.sub(pattern, "", text, flags=re.IGNORECASE)

    return text


def remove_non_printable(text: str) -> str:
    """
    Remove non-printable characters from PDF extraction.

    Newlines and tabs are kept. ``str.isprintable`` reports them as
    non-printable, and stripping them fused the last word of a line with the
    first word of the next one and made the line-based steps delete far too
    much. Other whitespace control characters (carriage return, form feed,
    vertical tab, unicode line/paragraph separators) become a newline.
    """
    kept = []
    for ch in text.replace("\r\n", "\n"):
        if ch.isprintable() or ch in "\n\t":
            kept.append(ch)
        elif ch.isspace():
            kept.append("\n")
    return "".join(kept)


def remove_duplicate_words(text: str) -> str:
    """
    Drop a word when it is identical to the word directly before it.

    The text is split on any whitespace and re-joined with single spaces,
    so line breaks do not survive this step.
    """
    words = text.split()
    deduped = [w for i, w in enumerate(words) if i == 0 or w != words[i - 1]]
    return " ".join(deduped)
94
+
95
+
96
+
97
+