Upload 22 files
Browse files- .gitattributes +4 -0
- Dockerfile +21 -0
- FastAPI_Client/admin.html +52 -0
- FastAPI_Client/admin.js +157 -0
- FastAPI_Client/index.html +88 -0
- FastAPI_Client/script.js +383 -0
- FastAPI_Client/style.css +145 -0
- READMEmodify.md +172 -0
- app.py +1019 -0
- config.py +25 -0
- doc_ingestion/AIML_Unit1_RMD_ECE.pdf +3 -0
- doc_ingestion/AIML_Unit2_RMD_ECE.pdf +3 -0
- faiss.index +3 -0
- kkt_SQLite_DB.db +3 -0
- rag/__init__.py +7 -0
- rag/chunker.py +160 -0
- rag/qdrant_retriever.py +10 -0
- requirements.txt +20 -0
- utils/__init__.py +7 -0
- utils/admin_fns.py +142 -0
- utils/core_imports.py +39 -0
- utils/text_cleanerV1.py +37 -0
- utils/text_cleanerV2.py +97 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
doc_ingestion/AIML_Unit1_RMD_ECE.pdf filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
doc_ingestion/AIML_Unit2_RMD_ECE.pdf filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
faiss.index filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
kkt_SQLite_DB.db filter=lfs diff=lfs merge=lfs -text
|
Dockerfile
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim

WORKDIR /app

# Install system dependencies (OCR support)
RUN apt-get update && apt-get install -y \
    tesseract-ocr \
    libgl1 \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies before copying the sources so that this
# (slow) layer is reused from cache whenever only application code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy all files
COPY . .

# Expose Hugging Face port
EXPOSE 7860

# Run FastAPI.
# The repository ships the server as app.py; the previous target
# "kkt_FastAPI_serverV22:app" does not exist in the image and makes uvicorn
# exit with an import error.
# NOTE(review): confirm that app.py exposes the FastAPI instance as `app`.
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
|
FastAPI_Client/admin.html
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
<html lang="en">
<head>
    <!-- Declare the encoding explicitly: the heading below contains an em dash. -->
    <meta charset="UTF-8">
    <title>KKT RAG Admin</title>
    <style>
        body{font-family:Arial;padding:30px}
        table{border-collapse:collapse;width:70%}
        th,td{border:1px solid #ccc;padding:8px}
        button{padding:5px 10px;margin:4px}
    </style>
</head>

<body>

<h2>KKT Secure Modular RAG Engine — Admin</h2>

<!-- LOGIN FOR ADMIN -->
<div id="authSection">
    <h3>Admin Login</h3>
    <input type="text" id="adminUser" placeholder="Username">
    <input type="password" id="adminPass" placeholder="Password">
    <button onclick="loginAdmin()">Login</button>
</div>

<!-- ADMIN PANEL (revealed by admin.js after a successful login) -->
<div id="adminPanel" style="display:none;">

    <h3>Upload and Index Document</h3>
    <input type="file" id="fileInput">
    <button onclick="uploadDocument()">Upload &amp; Index</button>

    <br><br>

    <button onclick="loadDocs()">Refresh Documents</button>
    <button onclick="deleteFolder()">Delete Knowledge</button>
    <button onclick="resetIndex()">Reset Index</button>

    <br><br>

    <!-- Rows are (re)built by loadDocs() in admin.js -->
    <table id="docTable">
        <tr>
            <th>Document</th>
            <th>Chunks</th>
            <th>Action</th>
        </tr>
    </table>

</div>

<script src="/static/admin.js?v=1"></script>
</body>
</html>
|
FastAPI_Client/admin.js
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
let token = "";
|
| 2 |
+
|
| 3 |
+
// ------------------ ADMIN LOGIN ------------------
|
| 4 |
+
/**
 * Log the administrator in.
 *
 * Reads the credentials from #adminUser / #adminPass, posts them to /login as
 * a URL-encoded form and, on success, stores the returned access token (in the
 * module-level `token` and in localStorage), swaps the login form for the
 * admin panel and loads the document list.
 */
async function loginAdmin() {
    const [username, password] = ["adminUser", "adminPass"].map(
        (id) => document.getElementById(id).value
    );

    // Both fields are mandatory.
    if (!(username && password)) {
        alert("Please enter username and password");
        return;
    }

    const credentials = new URLSearchParams();
    credentials.append("username", username);
    credentials.append("password", password);

    const request = {
        method: "POST",
        headers: {"Content-Type":"application/x-www-form-urlencoded"},
        body: credentials
    };

    try {
        const response = await fetch("/login", request);
        const payload = await response.json();

        if (!response.ok) {
            alert(payload.detail || "Login Failed");
            return;
        }

        token = payload.access_token;
        localStorage.setItem("token", token);

        alert("Login Successful");
        document.getElementById("authSection").style.display = "none";
        document.getElementById("adminPanel").style.display = "block";
        loadDocs();
    } catch (failure) {
        // Network error or a response body that is not JSON.
        console.error(failure);
        alert("Login request failed");
    }
}
|
| 42 |
+
|
| 43 |
+
// ------------------ UPLOAD DOCUMENT ------------------
|
| 44 |
+
/**
 * Upload the file selected in #fileInput to /admin/upload-document and
 * refresh the document table.
 *
 * The server reply is reported to the user: `message` first, then `error`,
 * then a string `detail`, and finally a generic message carrying the HTTP
 * status when the request failed without any of those fields.
 */
async function uploadDocument() {
    const fileInput = document.getElementById("fileInput");
    const file = fileInput.files[0];

    if (!file) {
        alert("Please select a file");
        return;
    }

    alert("File is being uploaded and indexed. Please wait ...");

    const formData = new FormData();
    formData.append("file", file);

    try {
        const res = await fetch("/admin/upload-document", {
            headers: {"Authorization": "Bearer " + token},
            method: "POST",
            body: formData
        });

        // An error page may not be JSON; do not let that mask the HTTP status.
        let data = {};
        try {
            data = (await res.json()) || {};
        } catch (parseErr) {
            console.error(parseErr);
        }

        // `detail` is only shown when it is a plain string.
        const detail = typeof data.detail === "string" ? data.detail : "";

        if (data.message) {
            alert(data.message);
        } else if (data.error || detail) {
            alert(data.error || detail);
        } else if (!res.ok) {
            alert("Upload failed (HTTP " + res.status + ")");
        }

        loadDocs();
    } catch (err) {
        console.error(err);
        alert("Upload failed");
    }
}
|
| 79 |
+
|
| 80 |
+
// ------------------ OTHER EXISTING FUNCTIONS ------------------
|
| 81 |
+
/**
 * Fetch /admin/list-documents and rebuild the #docTable rows.
 *
 * Rows are created with DOM APIs (textContent / addEventListener) instead of
 * string-built HTML, so a file name containing quotes, apostrophes or markup
 * can neither break the Delete handler nor inject HTML into the page.
 * Errors are logged to the console; the table is left as it was.
 */
async function loadDocs() {
    try {
        const response = await fetch("/admin/list-documents", {
            headers: {"Authorization": "Bearer " + token}
        });

        if (!response.ok) throw new Error("Failed to fetch documents");

        const data = await response.json();
        const table = document.getElementById("docTable");
        if (!table) return;

        // Tolerate a reply without a `documents` array.
        const documents = Array.isArray(data.documents) ? data.documents : [];

        // Static header only; nothing user-controlled goes through innerHTML.
        table.innerHTML = `<tr><th>Document</th><th>Chunks</th><th>Action</th></tr>`;

        if (documents.length === 0) {
            const emptyCell = table.insertRow().insertCell();
            emptyCell.colSpan = 3;
            emptyCell.textContent = "No documents found";
            return;
        }

        for (const doc of documents) {
            const row = table.insertRow();

            const link = document.createElement("a");
            link.href = "/uploads/" + encodeURIComponent(doc.document);
            link.target = "_blank";
            link.textContent = doc.document;
            row.insertCell().appendChild(link);

            row.insertCell().textContent = String(doc.chunks);

            const button = document.createElement("button");
            button.textContent = "Delete";
            button.addEventListener("click", (event) => deleteDocument(doc.document, event));
            row.insertCell().appendChild(button);
        }
    } catch (err) {
        console.error(err);
    }
}
|
| 112 |
+
|
| 113 |
+
/**
 * Delete one indexed document after user confirmation.
 *
 * @param name  File name passed to /admin/delete-document as `filename`.
 * @param event Click event of the Delete button; its target is disabled and
 *              relabelled while the request is running.
 *
 * A non-2xx reply is treated as a failure (the previous code announced
 * "Deleted successfully" for error replies that had no `message`), and the
 * button is restored when the deletion fails so it can be retried.
 */
async function deleteDocument(name, event) {
    if (!confirm("Delete " + name + " ?")) return;

    const btn = event && event.target ? event.target : null;
    const originalLabel = btn ? btn.innerText : "";

    try {
        if (btn) {
            btn.disabled = true;
            btn.innerText = "Deleting...";
        }

        const params = new URLSearchParams({ filename: name });
        const res = await fetch(`/admin/delete-document?${params.toString()}`, {
            headers: {"Authorization": "Bearer " + token},
            method: "DELETE"
        });

        const data = await res.json();

        if (!res.ok) {
            const detail = typeof data.detail === "string" ? data.detail : "";
            throw new Error(detail || "HTTP " + res.status);
        }

        alert(data.message || "Deleted successfully");
        await loadDocs();
    } catch (err) {
        console.error(err);
        alert("Error deleting file");

        // Give the button back so the user can try again.
        if (btn) {
            btn.disabled = false;
            btn.innerText = originalLabel;
        }
    }
}
|
| 135 |
+
|
| 136 |
+
/**
 * Ask for a folder name and delete it through /admin/delete-folder, then
 * refresh the document table.
 *
 * Failures (network error or non-2xx status) are reported to the user
 * instead of being silently ignored.
 */
async function deleteFolder() {
    const folder = prompt("Enter folder name to delete");
    // Cancelled prompt, empty input or whitespace only: nothing to do.
    if (!folder || !folder.trim()) return;

    try {
        const res = await fetch(`/admin/delete-folder?folder=${encodeURIComponent(folder)}`, {
            method: "DELETE",
            headers: {"Authorization": "Bearer " + token}
        });

        if (!res.ok) {
            alert("Failed to delete folder (HTTP " + res.status + ")");
        }
    } catch (err) {
        console.error(err);
        alert("Error deleting folder");
    }

    loadDocs();
}
|
| 147 |
+
|
| 148 |
+
/**
 * Reset the whole index through /admin/reset-index after user confirmation,
 * then refresh the document table.
 *
 * Failures (network error or non-2xx status) are reported to the user
 * instead of being silently ignored.
 */
async function resetIndex() {
    if (!confirm("Reset entire index?")) return;

    try {
        const res = await fetch("/admin/reset-index?confirm=true", {
            headers: {"Authorization": "Bearer " + token},
            method: "DELETE"
        });

        if (!res.ok) {
            alert("Failed to reset index (HTTP " + res.status + ")");
        }
    } catch (err) {
        console.error(err);
        alert("Error resetting index");
    }

    loadDocs();
}
|
FastAPI_Client/index.html
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
<html lang="en">
<head>

    <meta charset="UTF-8">
    <!-- Render at device width on phones instead of a zoomed-out desktop layout. -->
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <title>KKT Secure Modular RAG Engine</title>

    <link rel="stylesheet" href="/static/style.css">

</head>

<body>

<h1>KKT Secure Modular RAG Engine</h1>

<!-- LOGIN / REGISTER -->

<div id="authSection">

    <h2>Register</h2>

    <input type="text" id="regUser" placeholder="Username">
    <input type="password" id="regPass" placeholder="Password">
    <button onclick="register()">Register</button>

    <h2>Login</h2>

    <input type="text" id="loginUser" placeholder="Username">
    <input type="password" id="loginPass" placeholder="Password">
    <button onclick="login()">Login</button>

</div>


<!-- MAIN APP (revealed by script.js after a successful login) -->

<div id="mainApp" style="display:none;">

    <h2>Select Model</h2>

    <div class="model-row">
        <select id="modelSelect"></select>
        <button onclick="loadModels()">Refresh Models</button>
    </div>

    <!-- Model change confirmation message (filled by loadModels) -->
    <div id="modelInfo" style="margin-top:8px;color:green;font-weight:bold;"></div>

    <!-- Citation style: both boxes, one box, or none select the reference_style sent with each question -->
    <div class="ref-section">

        <label>
            <input type="checkbox" id="refInline">
            <span>Inline Citation</span>
        </label>

        <label>
            <input type="checkbox" id="refList">
            <span>Bibliography</span>
        </label>

    </div>

    <h2>Chat</h2>

    <div id="chatWindow"></div>

    <div id="chatControls">

        <input type="text" id="questionInput" placeholder="Ask a question">

        <button onclick="sendQuestion()">Send</button>

        <button onclick="startVoice()">🎤 Voice</button>

        <button onclick="readChat()">🔈Read</button>

        <button onclick="stopVoice()">Stop</button>

        <button onclick="saveChatAsPDF()">📄 Save as PDF</button>

    </div>

</div>

<script src="/static/script.js"></script>

</body>
</html>
|
FastAPI_Client/script.js
ADDED
|
@@ -0,0 +1,383 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
let token = "";
|
| 2 |
+
let recognition;
|
| 3 |
+
let previousModel = "";
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
/* ------------------ REGISTER ------------------ */
|
| 7 |
+
|
| 8 |
+
/**
 * Register a new account.
 *
 * Sends the values of #regUser / #regPass to /register as JSON and reports
 * the outcome with an alert (the server's `detail` on failure, if present).
 */
async function register()
{
    const fieldValue = (id) => document.getElementById(id).value;

    const account = {
        username: fieldValue("regUser"),
        password: fieldValue("regPass")
    };

    const response = await fetch("/register", {
        method: "POST",
        headers: { "Content-Type":"application/json" },
        body: JSON.stringify(account)
    });

    const reply = await response.json();

    if (!response.ok) {
        alert(reply.detail || "Registration failed");
        return;
    }

    alert("Successfully Registered");
}
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
/* ------------------ LOGIN ------------------ */
|
| 41 |
+
|
| 42 |
+
/**
 * Log a user in.
 *
 * Posts #loginUser / #loginPass to /login as a URL-encoded form. On success
 * the access token is stored (module-level `token` and localStorage), the
 * main application is revealed and the model list is loaded.
 *
 * Failures are reported the same way as in the admin login: the server's
 * `detail` when the credentials are rejected, and a generic message when the
 * request itself fails (previously an unhandled promise rejection).
 */
async function login()
{
    const username = document.getElementById("loginUser").value;
    const password = document.getElementById("loginPass").value;

    const formData = new URLSearchParams();
    formData.append("username", username);
    formData.append("password", password);

    try {
        const response = await fetch("/login", {
            method: "POST",
            headers: {
                "Content-Type":"application/x-www-form-urlencoded"
            },
            body: formData
        });

        const data = await response.json();

        if (!response.ok) {
            alert(data.detail || "Login Failed");
            return;
        }

        token = data.access_token;
        localStorage.setItem("token", token);

        alert("Login Successful");

        document.getElementById("authSection").style.display = "none";
        document.getElementById("mainApp").style.display = "block";

        loadModels();
    } catch (err) {
        // Network error or a response body that is not JSON.
        console.error(err);
        alert("Login request failed");
    }
}
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
/* ------------------ LOAD MODELS ------------------ */
|
| 87 |
+
/**
 * Populate #modelSelect from GET /v1/models and wire its change handler.
 *
 * The first model returned becomes the selection (also remembered in the
 * module-level `previousModel`). Whenever the user picks another model the
 * #modelInfo banner is updated and the change is announced through speak().
 */
async function loadModels()
{
    const dropdown = document.getElementById("modelSelect");

    // Warn that refreshing falls back to the default (first) model.
    const banner = document.getElementById("modelInfo");
    if (banner) {
        banner.innerText =
            "🤖 If you click Refresh Models button, Default LLM model will be selected.";
        banner.style.color = "blue";
    }

    const reply = await fetch("/v1/models");
    const catalogue = await reply.json();

    // Rebuild the option list from scratch.
    dropdown.innerHTML = "";
    for (const model of catalogue.data) {
        const entry = document.createElement("option");
        entry.value = model.id;
        entry.text = model.id;
        dropdown.appendChild(entry);
    }

    // Select the first entry only once the list is populated.
    if (dropdown.options.length > 0) {
        dropdown.selectedIndex = 0;
        previousModel = dropdown.value;
    }

    // Announce model switches, visually and by voice.
    dropdown.onchange = () => {
        const info = document.getElementById("modelInfo");
        if (!info) return;

        const chosen = dropdown.value;

        info.innerText =
            "🤖 You switched to: " + chosen +
            ". This model will be used for ALL new questions.";
        info.style.color = "green";

        speak(
            "You switched to model " + chosen +
            ". This model will be used for all new questions."
        );
    };
}
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
/* ------------------ FILE UPLOAD ------------------ */
|
| 148 |
+
|
| 149 |
+
/**
 * Upload the file selected in #fileInput to /upload and show the server's
 * message.
 *
 * Guards against a page without a #fileInput element and against an empty
 * selection (both previously ended in a TypeError or in the literal string
 * "undefined" being uploaded), and reports request failures.
 */
async function uploadFile()
{
    const picker = document.getElementById("fileInput");
    const file = picker && picker.files ? picker.files[0] : undefined;

    if (!file) {
        alert("Please select a file");
        return;
    }

    const formData = new FormData();
    formData.append("file", file);

    try {
        const response = await fetch("/upload", {
            method: "POST",
            headers: {
                Authorization: "Bearer " + token
            },
            body: formData
        });

        const data = await response.json();

        // `detail` is only shown when it is a plain string.
        const detail = typeof data.detail === "string" ? data.detail : "";

        if (!response.ok) {
            alert(data.message || detail || "Upload failed (HTTP " + response.status + ")");
            return;
        }

        alert(data.message); // shows only the message string
    } catch (err) {
        console.error(err);
        alert("Upload failed");
    }
}
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
|
| 175 |
+
/* ------------------ CHAT ------------------ */
|
| 176 |
+
|
| 177 |
+
/**
 * Send the text of #questionInput to /v1/chat/completions and render the
 * answer.
 *
 * The request carries the selected model and a `reference_style` derived
 * from the two citation checkboxes ("both", "inline", "list" or "none").
 * The answer is shown with numeric citation markers removed and is read
 * aloud in a further reduced form (no tags, no PDF references, nothing
 * from "References:" onwards).
 *
 * A failed request or a reply without `choices[0].message.content` now
 * produces a visible notice instead of an uncaught exception.
 */
async function sendQuestion()
{
    const inputBox = document.getElementById("questionInput");
    const question = inputBox.value;

    if (!question.trim()) return;

    addUserMessage(question);
    inputBox.value = ""; // clears the input box

    const model = document.getElementById("modelSelect").value;

    const inline = document.getElementById("refInline").checked;
    const list = document.getElementById("refList").checked;

    // Both boxes ticked keeps the default "both".
    let refStyle = "both";
    if (inline && !list) refStyle = "inline";
    else if (!inline && list) refStyle = "list";
    else if (!inline && !list) refStyle = "none";

    let data;
    try {
        const response = await fetch("/v1/chat/completions", {
            method: "POST",
            headers: {
                "Content-Type":"application/json",
                "Authorization":"Bearer " + token
            },
            body: JSON.stringify({
                model: model,
                reference_style: refStyle,
                messages: [
                    {role:"user",content:question}
                ]
            })
        });

        data = await response.json();

        if (!response.ok) {
            throw new Error(
                data && typeof data.detail === "string" ? data.detail : "HTTP " + response.status
            );
        }
    } catch (err) {
        console.error(err);
        // Static text only: addBotMessage renders its argument as HTML.
        addBotMessage("Sorry, the request failed. Please try again.");
        return;
    }

    const firstChoice = data && Array.isArray(data.choices) ? data.choices[0] : undefined;
    const answer = firstChoice && firstChoice.message ? firstChoice.message.content : undefined;

    if (typeof answer !== "string") {
        console.error("Unexpected chat completion payload", data);
        addBotMessage("Sorry, the server returned an unexpected response.");
        return;
    }

    // Clean unwanted model artifacts for display.
    const displayAnswer = answer
        .replace(/\[\d+\]/g, "")                    // remove [1], [2]
        .replace(/Note: Citation marker.*$/s, "");  // drop the trailing note

    addBotMessage(displayAnswer);

    // Spoken version: additionally strip tags, "(… .pdf …)" references and
    // collapse whitespace, then cut everything from "References:" onwards.
    let cleanAnswer = answer
        .replace(/<[^>]*>/g, "")
        .replace(/\([^)]*\.pdf[^)]*\)/gi, "")
        .replace(/\[\d+\]/g, "")
        .replace(/Note: Citation marker.*$/s, "")
        .replace(/\s+/g, " ")
        .trim();

    cleanAnswer = cleanAnswer.split("References:")[0];

    speak(cleanAnswer);
}
|
| 240 |
+
|
| 241 |
+
// Once the DOM is ready, let the Enter key in the question box submit the
// question (same effect as clicking the Send button).
document.addEventListener("DOMContentLoaded", () => {
    const questionBox = document.getElementById("questionInput");

    questionBox.addEventListener("keydown", (event) => {
        if (event.key !== "Enter") return;

        event.preventDefault();
        sendQuestion();
    });
});
|
| 251 |
+
|
| 252 |
+
|
| 253 |
+
/* ------------------ CHAT DISPLAY ------------------ */
|
| 254 |
+
|
| 255 |
+
/**
 * Append a user message to #chatWindow.
 *
 * The text is assigned through innerText, so it is shown literally and never
 * parsed as HTML.
 */
function addUserMessage(text)
{
    const bubble = document.createElement("div");
    bubble.className = "userMessage";
    bubble.innerText = text;

    document.getElementById("chatWindow").appendChild(bubble);
}
|
| 269 |
+
|
| 270 |
+
|
| 271 |
+
/**
 * Append a bot message to #chatWindow.
 *
 * Unlike user messages, the text is assigned through innerHTML and is
 * therefore parsed as markup.
 * NOTE(review): this looks deliberate (answers appear to carry HTML, which
 * sendQuestion strips for speech), but it makes the page trust whatever the
 * server/model returns — confirm the backend sanitises answers.
 */
function addBotMessage(text)
{
    const bubble = document.createElement("div");
    bubble.className = "botMessage";
    bubble.innerHTML = text;

    document.getElementById("chatWindow").appendChild(bubble);
}
|
| 285 |
+
|
| 286 |
+
|
| 287 |
+
/* ------------------ VOICE INPUT ------------------ */
|
| 288 |
+
|
| 289 |
+
/**
 * Start speech recognition and put the recognised phrase into
 * #questionInput.
 *
 * The recogniser is kept in the module-level `recognition` so stopVoice()
 * can stop it. Browsers exposing neither `SpeechRecognition` nor the
 * prefixed `webkitSpeechRecognition` get an explanatory alert instead of an
 * uncaught ReferenceError.
 */
function startVoice()
{
    const Recognizer = window.SpeechRecognition || window.webkitSpeechRecognition;

    if (!Recognizer) {
        alert("Voice input is not supported in this browser");
        return;
    }

    recognition = new Recognizer();
    recognition.lang = "en-US";

    recognition.onresult = function (event)
    {
        // First alternative of the first result is the best transcript.
        document.getElementById("questionInput").value = event.results[0][0].transcript;
    };

    recognition.start();
}
|
| 304 |
+
|
| 305 |
+
/**
 * Stop voice input (when a recogniser has been created) and silence any
 * speech output that is currently playing or queued.
 */
function stopVoice()
{
    if (recognition) recognition.stop();

    speechSynthesis.cancel();
}
|
| 315 |
+
|
| 316 |
+
|
| 317 |
+
/* ------------------ VOICE OUTPUT ------------------ */
|
| 318 |
+
|
| 319 |
+
/**
 * Read `text` aloud in US English.
 *
 * Any speech already in progress is cancelled first so utterances do not
 * overlap or pile up in the queue.
 */
function speak(text)
{
    speechSynthesis.cancel();

    const utterance = new SpeechSynthesisUtterance(text);
    utterance.lang = "en-US";

    speechSynthesis.speak(utterance);
}
|
| 332 |
+
|
| 333 |
+
/**
 * Read the most recent bot message aloud.
 *
 * Inline "(… .pdf …)" references and numeric markers such as [1] are removed
 * and everything from "References:" onwards is dropped before speaking.
 * Does nothing when the chat holds no bot message yet.
 */
function readChat()
{
    const botBubbles = document.getElementsByClassName("botMessage");
    const total = botBubbles.length;

    if (total === 0) return;

    const speechText = botBubbles[total - 1].innerText
        .replace(/\([^)]*\.pdf[^)]*\)/gi, "")   // full inline references
        .replace(/\[\d+\]/g, "")                // [1], [2], ...
        .split("References:")[0];               // cut the bibliography

    speak(speechText);
}
|
| 355 |
+
|
| 356 |
+
/**
 * Open the chat history in a new window and trigger the browser's print
 * dialog (from which it can be saved as PDF).
 *
 * When the browser blocks the popup, window.open returns null; the user is
 * told so instead of the function failing with a TypeError.
 */
function saveChatAsPDF()
{
    const chatWindow = document.getElementById("chatWindow");

    const originalContent = chatWindow.innerHTML;

    const printWindow = window.open('', '', 'width=800,height=600');

    if (!printWindow) {
        alert("Could not open the print window. Please allow pop-ups for this site.");
        return;
    }

    printWindow.document.write(`
<html>
<head>
<title>Chat Export</title>
<style>
body { font-family: Arial; padding: 20px; }
.userMessage { color: blue; margin: 5px 0; }
.botMessage { color: black; margin: 5px 0; }
</style>
</head>
<body>
<h2>Chat History</h2>
${originalContent}
</body>
</html>
`);

    // close() ends the document stream so the content is fully laid out
    // before the print dialog opens.
    printWindow.document.close();
    printWindow.print();
}
|
FastAPI_Client/style.css
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
body
|
| 2 |
+
{
|
| 3 |
+
font-family: Arial;
|
| 4 |
+
background: #f4f4f4;
|
| 5 |
+
margin: 40px;
|
| 6 |
+
}
|
| 7 |
+
|
| 8 |
+
h1
|
| 9 |
+
{
|
| 10 |
+
text-align: center;
|
| 11 |
+
}
|
| 12 |
+
|
| 13 |
+
#authSection
|
| 14 |
+
{
|
| 15 |
+
background: white;
|
| 16 |
+
padding: 20px;
|
| 17 |
+
width: 400px;
|
| 18 |
+
margin: auto;
|
| 19 |
+
border-radius: 8px;
|
| 20 |
+
}
|
| 21 |
+
|
| 22 |
+
#mainApp
|
| 23 |
+
{
|
| 24 |
+
margin-top: 30px;
|
| 25 |
+
}
|
| 26 |
+
|
| 27 |
+
input
|
| 28 |
+
{
|
| 29 |
+
width: 100%;
|
| 30 |
+
padding: 10px;
|
| 31 |
+
margin-top: 10px;
|
| 32 |
+
font-size: 18px; /* 🔥 increase input text size */
|
| 33 |
+
}
|
| 34 |
+
|
| 35 |
+
button
|
| 36 |
+
{
|
| 37 |
+
padding: 10px;
|
| 38 |
+
margin-top: 10px;
|
| 39 |
+
cursor: pointer;
|
| 40 |
+
}
|
| 41 |
+
|
| 42 |
+
#chatWindow
|
| 43 |
+
{
|
| 44 |
+
height: 400px;
|
| 45 |
+
background: white;
|
| 46 |
+
overflow-y: auto;
|
| 47 |
+
padding: 10px;
|
| 48 |
+
border-radius: 8px;
|
| 49 |
+
}
|
| 50 |
+
|
| 51 |
+
.userMessage
|
| 52 |
+
{
|
| 53 |
+
text-align:left;
|
| 54 |
+
color:blue;
|
| 55 |
+
margin:5px;
|
| 56 |
+
font-family:Calibri;
|
| 57 |
+
font-size:28px;
|
| 58 |
+
}
|
| 59 |
+
|
| 60 |
+
.botMessage
|
| 61 |
+
{
|
| 62 |
+
text-align: left;
|
| 63 |
+
color:black;
|
| 64 |
+
margin:5px;
|
| 65 |
+
font-family:Arial;
|
| 66 |
+
font-size:20px;
|
| 67 |
+
}
|
| 68 |
+
|
| 69 |
+
/* Row of chat controls (question box and action buttons).
   This rule was declared twice with identical declarations; one copy is kept. */
#chatControls
{
margin-top: 10px;
display: flex;
gap: 10px;
}
|
| 82 |
+
|
| 83 |
+
/* ===== Reference checkbox alignment fix ===== */
|
| 84 |
+
.ref-section
|
| 85 |
+
{
|
| 86 |
+
display: flex;
|
| 87 |
+
align-items: center;
|
| 88 |
+
gap: 20px;
|
| 89 |
+
margin-top: 25px; /* 🔥 move downward */
|
| 90 |
+
flex-wrap: nowrap; /* 🔥 force single row */
|
| 91 |
+
}
|
| 92 |
+
|
| 93 |
+
.ref-section label
|
| 94 |
+
{
|
| 95 |
+
display: flex;
|
| 96 |
+
align-items: center;
|
| 97 |
+
gap: 8px;
|
| 98 |
+
white-space: nowrap; /* ===== Prevent label text from wrapping ===== */
|
| 99 |
+
}
|
| 100 |
+
|
| 101 |
+
/* ===== Fine-tune checkbox vertical position ===== */
|
| 102 |
+
.ref-section input[type="checkbox"]
|
| 103 |
+
{
|
| 104 |
+
margin: 0;
|
| 105 |
+
transform: translateY(-1px); /* 🔥 move checkbox UP */
|
| 106 |
+
}
|
| 107 |
+
|
| 108 |
+
.model-row
|
| 109 |
+
{
|
| 110 |
+
display: flex;
|
| 111 |
+
gap: 10px;
|
| 112 |
+
align-items: center; /* 🔥 fix alignment */
|
| 113 |
+
margin-top: -15px;
|
| 114 |
+
}
|
| 115 |
+
|
| 116 |
+
.model-row select
|
| 117 |
+
{
|
| 118 |
+
width: 250px; /* 🔥 fixed reasonable width */
|
| 119 |
+
height: 36px; /* 🔥 controls actual height */
|
| 120 |
+
font-size: 16px;
|
| 121 |
+
}
|
| 122 |
+
|
| 123 |
+
.model-row button
|
| 124 |
+
{
|
| 125 |
+
height: 36px;
|
| 126 |
+
padding: 0 15px;
|
| 127 |
+
white-space: nowrap;
|
| 128 |
+
display: flex; /* 🔥 important */
|
| 129 |
+
align-items: center; /* 🔥 vertical centering */
|
| 130 |
+
}
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
/* ===== Fine-tune dropdown vertical position ===== */
|
| 134 |
+
#modelSelect
|
| 135 |
+
{
|
| 136 |
+
transform: translateY(2px); /* 🔥 move DOWN */
|
| 137 |
+
}
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
/* ===== Slightly bigger checkbox ===== */
|
| 141 |
+
.ref-section input[type="checkbox"]
{
/* A later `transform` replaces an earlier one on the same selector rather
   than adding to it, so the 1px upward nudge declared earlier in this file
   was lost. Both functions are combined here to keep the nudge and the size. */
transform: translateY(-1px) scale(1.5);
margin: 0;
}
|
READMEmodify.md
ADDED
|
@@ -0,0 +1,172 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 🚀 FastAPI RAG Server (V22)
|
| 2 |
+
|
| 3 |
+
A production-ready Retrieval-Augmented Generation (RAG) API built using FastAPI. This project enables document ingestion, semantic search using embeddings, and LLM-based question answering.
|
| 4 |
+
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
## 📌 Features
|
| 8 |
+
|
| 9 |
+
- 📄 PDF document ingestion and chunking
|
| 10 |
+
- 🧹 Text cleaning pipeline
|
| 11 |
+
- 🔎 Semantic search using FAISS
|
| 12 |
+
- 🤖 LLM integration (Groq / OpenAI compatible API)
|
| 13 |
+
- 🔐 Authentication with hashed passwords (Passlib + JWT)
|
| 14 |
+
- ⚡ FastAPI async endpoints
|
| 15 |
+
|
| 16 |
+
---
|
| 17 |
+
|
| 18 |
+
## 🏗️ Project Structure
|
| 19 |
+
|
| 20 |
+
```
|
| 21 |
+
.
|
| 22 |
+
├── app.py # Main FastAPI server
|
| 23 |
+
├── chunker.py # Document loading & chunking
|
| 24 |
+
├── text_cleanerV2.py # Text preprocessing pipeline
|
| 25 |
+
├── indexer.py # Embedding + FAISS index
|
| 26 |
+
├── requirements.txt # Dependencies
|
| 27 |
+
└── README.md
|
| 28 |
+
```
|
| 29 |
+
|
| 30 |
+
---
|
| 31 |
+
|
| 32 |
+
## ⚙️ Installation
|
| 33 |
+
|
| 34 |
+
### 1. Clone the repository
|
| 35 |
+
```bash
|
| 36 |
+
git clone <your-repo-url>
|
| 37 |
+
cd <repo-folder>
|
| 38 |
+
```
|
| 39 |
+
|
| 40 |
+
### 2. Create environment (recommended)
|
| 41 |
+
```bash
|
| 42 |
+
conda create -n rag_env python=3.11
|
| 43 |
+
conda activate rag_env
|
| 44 |
+
```
|
| 45 |
+
|
| 46 |
+
### 3. Install dependencies
|
| 47 |
+
```bash
|
| 48 |
+
pip install -r requirements.txt
|
| 49 |
+
```
|
| 50 |
+
|
| 51 |
+
### Required dependencies
|
| 52 |
+
Ensure the following are present:
|
| 53 |
+
```
|
| 54 |
+
fastapi
|
| 55 |
+
uvicorn
|
| 56 |
+
passlib[bcrypt]
|
| 57 |
+
python-jose[cryptography]
|
| 58 |
+
sentence-transformers
|
| 59 |
+
faiss-cpu
|
| 60 |
+
httpx
|
| 61 |
+
```
|
| 62 |
+
|
| 63 |
+
---
|
| 64 |
+
|
| 65 |
+
## 🔑 Environment Variables
|
| 66 |
+
|
| 67 |
+
Set your API keys before running:
|
| 68 |
+
|
| 69 |
+
```bash
|
| 70 |
+
export GROQ_API_KEY="your_api_key_here"
|
| 71 |
+
```
|
| 72 |
+
|
| 73 |
+
On Windows PowerShell:
|
| 74 |
+
```powershell
|
| 75 |
+
$env:GROQ_API_KEY="your_api_key_here"
|
| 76 |
+
```
|
| 77 |
+
|
| 78 |
+
---
|
| 79 |
+
|
| 80 |
+
## ▶️ Running the Server
|
| 81 |
+
|
| 82 |
+
```bash
|
| 83 |
+
uvicorn app:app --host 0.0.0.0 --port 8000
|
| 84 |
+
```
|
| 85 |
+
|
| 86 |
+
Access API docs:
|
| 87 |
+
|
| 88 |
+
👉 http://localhost:8000/docs
|
| 89 |
+
|
| 90 |
+
---
|
| 91 |
+
|
| 92 |
+
## 🔄 Workflow
|
| 93 |
+
|
| 94 |
+
1. Upload documents (PDF)
|
| 95 |
+
2. Chunk and clean text
|
| 96 |
+
3. Generate embeddings using SentenceTransformers
|
| 97 |
+
4. Store vectors in FAISS
|
| 98 |
+
5. Query → retrieve relevant chunks
|
| 99 |
+
6. Send context to LLM → generate answer
|
| 100 |
+
|
| 101 |
+
---
|
| 102 |
+
|
| 103 |
+
## 🔐 Authentication
|
| 104 |
+
|
| 105 |
+
- Password hashing: Passlib (bcrypt)
|
| 106 |
+
- Token system: JWT (python-jose)
|
| 107 |
+
|
| 108 |
+
Example flow:
|
| 109 |
+
1. Register user
|
| 110 |
+
2. Login → receive token
|
| 111 |
+
3. Use token in protected endpoints
|
| 112 |
+
|
| 113 |
+
---
|
| 114 |
+
|
| 115 |
+
## 🐳 Docker (Optional)
|
| 116 |
+
|
| 117 |
+
### Build image
|
| 118 |
+
```bash
|
| 119 |
+
docker build -t rag-fastapi .
|
| 120 |
+
```
|
| 121 |
+
|
| 122 |
+
### Run container
|
| 123 |
+
```bash
|
| 124 |
+
docker run -p 8000:8000 rag-fastapi
|
| 125 |
+
```
|
| 126 |
+
|
| 127 |
+
---
|
| 128 |
+
|
| 129 |
+
## ⚠️ Common Issues
|
| 130 |
+
|
| 131 |
+
### ❌ ModuleNotFoundError: passlib
|
| 132 |
+
Fix:
|
| 133 |
+
```bash
|
| 134 |
+
pip install passlib[bcrypt]
|
| 135 |
+
```
|
| 136 |
+
|
| 137 |
+
### ❌ API timeout issues
|
| 138 |
+
- Increase timeout in `httpx.AsyncClient`
|
| 139 |
+
- Check API key validity
|
| 140 |
+
|
| 141 |
+
---
|
| 142 |
+
|
| 143 |
+
## 📈 Future Improvements
|
| 144 |
+
|
| 145 |
+
- Streaming responses
|
| 146 |
+
- Multi-document indexing
|
| 147 |
+
- Role-based authentication
|
| 148 |
+
- UI integration (React/Streamlit)
|
| 149 |
+
|
| 150 |
+
---
|
| 151 |
+
|
| 152 |
+
## 🧠 Tech Stack
|
| 153 |
+
|
| 154 |
+
- FastAPI
|
| 155 |
+
- FAISS
|
| 156 |
+
- SentenceTransformers
|
| 157 |
+
- Passlib
|
| 158 |
+
- JWT (python-jose)
|
| 159 |
+
- httpx
|
| 160 |
+
|
| 161 |
+
---
|
| 162 |
+
|
| 163 |
+
## 📄 License
|
| 164 |
+
|
| 165 |
+
MIT License
|
| 166 |
+
|
| 167 |
+
---
|
| 168 |
+
|
| 169 |
+
## 👨‍💻 Author
|
| 170 |
+
|
| 171 |
+
Developed by Thyagharajan K K
|
| 172 |
+
|
app.py
ADDED
|
@@ -0,0 +1,1019 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
Created on Fri Feb 20 13:39:23 2026
|
| 4 |
+
|
| 5 |
+
@author: THYAGHARAJAN
|
| 6 |
+
"""
|
| 7 |
+
# In[]
|
| 8 |
+
#NOTE
|
| 9 |
+
#if you change the file name kkt_FastAPI_serverV21 change the same in core_imports.py file
|
| 10 |
+
#All uploaded files are saved in the doc_ingestion folder, located where the server file exists
|
| 11 |
+
# In[]
|
| 12 |
+
|
| 13 |
+
"""
|
| 14 |
+
orchestrator layer
|
| 15 |
+
(torch26_cu124_trans_unsloth) D: cd kkt_secure_modular_rag_engine
|
| 16 |
+
uvicorn kkt_FastAPI_server:app --host 0.0.0.0 --port 8000
|
| 17 |
+
on Browser: http://localhost:3009
|
| 18 |
+
V1 works well when selected llama3:8b
|
| 19 |
+
V2 works with WebUI; LLM model selection, adding PDF files, system prompt and RAG prompt are integrated at the WebUI
|
| 20 |
+
but hallucination could not be controlled and some layers could not communicate with WebUI
|
| 21 |
+
Don't delete V2'
|
| 22 |
+
V3 server uses a separate FastAPI Swagger client. Working
|
| 23 |
+
V4 HTML client is used. Ollama output is not obtained
|
| 24 |
+
V5 LLM provides output without references.
|
| 25 |
+
V6 References included
|
| 26 |
+
V7 References and Citations improved
|
| 27 |
+
V8 User interface was improved and tested
|
| 28 |
+
V9 doc_ingestion folder and its functionality was changed for opening the reference files on the browser.
|
| 29 |
+
But chat window didn't show any link
|
| 30 |
+
V10 Inline reference was modified. working
|
| 31 |
+
V11 the facility needed to delete knowledge was added but not tested completely
|
| 32 |
+
V12 same upload_file used for admin. Single ingestion system. does chunking,embedding,FAISS update,DB storage. Many modifications done.
|
| 33 |
+
V13 Changes made but not checked
|
| 34 |
+
V14 checked working well
|
| 35 |
+
V15 Reference modes are working well. Still at the end it reads vr vr vr
|
| 36 |
+
V16 speaking vr vr vr at the end was removed in the script.js and working fine.
|
| 37 |
+
Bibliography references may differ from the inline citations, because not all bibliography entries are used by the LLM. Checked working
|
| 38 |
+
V17 References are properly displayed on all the three modes. Checked OK
|
| 39 |
+
V18 File upload UI for user (in index.html) was removed.
|
| 40 |
+
V19 UI was corrected. References were corrected. Working fine.
|
| 41 |
+
V20 Save as PDF button was added in the index and script files
|
| 42 |
+
V21 if you change the file name kkt_FastAPI_serverV21 change the same in core_imports.py file
|
| 43 |
+
V22 Uses Groq, API key and cloud for deploying on Hugging Face
|
| 44 |
+
"""
|
| 45 |
+
|
| 46 |
+
# In[]
|
| 47 |
+
import os
|
| 48 |
+
import re
|
| 49 |
+
import httpx
|
| 50 |
+
import threading
|
| 51 |
+
from fastapi import FastAPI, HTTPException, UploadFile, File
|
| 52 |
+
#FastAPI application serves as a server for the WebUI and as a client for local Ollama. This is to be done asynchronously
|
| 53 |
+
#So, the app acts as a proxy to Ollama
|
| 54 |
+
from pydantic import BaseModel
|
| 55 |
+
from typing import List, Optional
|
| 56 |
+
from datetime import datetime
|
| 57 |
+
import sqlite3
|
| 58 |
+
from passlib.context import CryptContext
|
| 59 |
+
from jose import JWTError, jwt
|
| 60 |
+
from fastapi import Depends
|
| 61 |
+
from fastapi.security import OAuth2PasswordBearer, OAuth2PasswordRequestForm
|
| 62 |
+
import numpy as np
|
| 63 |
+
import faiss
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
from sentence_transformers import SentenceTransformer
|
| 68 |
+
from rag.chunker import DocChunker #import class
|
| 69 |
+
#from rag.indexer import build_vector_index #imports function
|
| 70 |
+
import requests
|
| 71 |
+
|
| 72 |
+
from fastapi.responses import FileResponse
|
| 73 |
+
from fastapi.staticfiles import StaticFiles
|
| 74 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
import shutil
|
| 79 |
+
from pathlib import Path
|
| 80 |
+
|
| 81 |
+
from utils.text_cleanerV2 import clean_text
|
| 82 |
+
from utils.admin_fns import router as admin_router
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
#from rag.retriever_factory import get_retriever
|
| 86 |
+
'''
|
| 87 |
+
from rag.citation_validator import validate_citations
|
| 88 |
+
from rag.hallucination_control import apply_confidence_filter
|
| 89 |
+
from rag.permission_gate import check_external_access
|
| 90 |
+
'''
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
from models.model_config import VECTOR_BACKEND, INDEX_PATH
|
| 94 |
+
|
| 95 |
+
# JWT Configuration
# The signing key is read from the SECRET_KEY environment variable so that a
# deployment does not have to keep the key in source control. The historical
# hard-coded value is kept only as a fallback for existing local setups;
# anyone who can read this file can forge tokens signed with that fallback.
SECRET_KEY = os.environ.get("SECRET_KEY") or "KKT_SUPER_SECRET_KEY_CHANGE_THIS"
ALGORITHM = "HS256"  # algorithm name handed to jwt.decode() in get_current_user
ACCESS_TOKEN_EXPIRE_MINUTES = 60
|
| 99 |
+
|
| 100 |
+
#UPLOAD_FOLDER = "uploads"
|
| 101 |
+
EMBEDDING_MODEL = SentenceTransformer("all-MiniLM-L6-v2")
|
| 102 |
+
#EMBEDDING_MODEL → encodes text to vector
|
| 103 |
+
|
| 104 |
+
VECTOR_INDEX = None
|
| 105 |
+
#VECTOR_INDEX → FAISS index storing vectors
|
| 106 |
+
INDEX_READY = False #Used if query is given before the vector rebuilding is not completed after the app starts
|
| 107 |
+
|
| 108 |
+
pwd_context = CryptContext(schemes=["bcrypt"], deprecated="auto")
|
| 109 |
+
oauth2_scheme = OAuth2PasswordBearer(tokenUrl="login")
|
| 110 |
+
|
| 111 |
+
# In[]
|
| 112 |
+
'''
|
| 113 |
+
# =====================================
|
| 114 |
+
# 🔐 Allowed Models (Ollama)
|
| 115 |
+
# =====================================
|
| 116 |
+
ALLOWED_MODELS = [
|
| 117 |
+
"llama3:8b",
|
| 118 |
+
"llama3.1:8b",
|
| 119 |
+
"phi3:mini",
|
| 120 |
+
"Phi-3 Medium",
|
| 121 |
+
"mistral",
|
| 122 |
+
"qwen2.5:7b",
|
| 123 |
+
"deepseek-r1:7b",
|
| 124 |
+
"llama3.2:3b"
|
| 125 |
+
]
|
| 126 |
+
#
|
| 127 |
+
# Default model (if WebUI sends None)
|
| 128 |
+
DEFAULT_MODEL = "deepseek-r1:7b"
|
| 129 |
+
'''
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
# =====================================
|
| 133 |
+
# 🔐 Allowed Models (Groq)
|
| 134 |
+
# =====================================
|
| 135 |
+
ALLOWED_MODELS = [
|
| 136 |
+
"llama-3.1-8b-instant",
|
| 137 |
+
"llama-3.3-70b-versatile",
|
| 138 |
+
"mixtral-8x7b-32768",
|
| 139 |
+
"gemma-7b-it"
|
| 140 |
+
]
|
| 141 |
+
|
| 142 |
+
DEFAULT_MODEL = "llama-3.1-8b-instant"
|
| 143 |
+
#GROQ_API_KEY -- Environment variable was set using setx GROQ_API_KEY "<your_groq_api_key>" (never commit the real key; revoke and rotate any key that was committed)
|
| 144 |
+
#in Windows Powershell
|
| 145 |
+
|
| 146 |
+
#Ollama endpoint (since FastAPI runs on Windows host)
|
| 147 |
+
#OLLAMA_URL = "http://localhost:11434/api/chat" #Hugging Face Spaces does NOT support Ollama Delete this not needed
|
| 148 |
+
# In[] SERVER-SIDE DOCUMENT REPOSITORY (FOR REFERENCE)
|
| 149 |
+
from config import UPLOAD_FOLDER, DB_PATH_FILE, FAISS_INDEX_PATH
|
| 150 |
+
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
|
| 151 |
+
|
| 152 |
+
FAISS_LOCK = threading.Lock() #Ensure safe concurrent access. Prevent index corruption.
|
| 153 |
+
|
| 154 |
+
# In[] # FASTAPI SERVER
|
| 155 |
+
|
| 156 |
+
app = FastAPI(
|
| 157 |
+
title="KKT Secure Modular RAG Engine V11",
|
| 158 |
+
version="11.0.0",
|
| 159 |
+
description="Secure modular RAG pipeline with authentication, chunking, FAISS retrieval"
|
| 160 |
+
)
|
| 161 |
+
#returns an ASGI (Asynchronous Server Gateway Interface) compatible app instance
|
| 162 |
+
#This app can be run by uvicorn server
|
| 163 |
+
#Title will be shown in the doc page of http://localhost:8000/docs
|
| 164 |
+
#app = FastAPI() #without metadata
|
| 165 |
+
|
| 166 |
+
app.add_middleware(
|
| 167 |
+
CORSMiddleware,
|
| 168 |
+
allow_origins=["*"],
|
| 169 |
+
allow_credentials=True,
|
| 170 |
+
allow_methods=["*"],
|
| 171 |
+
allow_headers=["*"],
|
| 172 |
+
)
|
| 173 |
+
#configures CORS (Cross-Origin Resource Sharing) of the app
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
# UI files. Maps URL prefix /static → local folder FastAPI_Client
|
| 177 |
+
app.mount("/static", StaticFiles(directory="FastAPI_Client"), name="static")
|
| 178 |
+
#URL: /static/app.js will be mapped to the File: FastAPI_Client/app.js
|
| 179 |
+
|
| 180 |
+
# uploaded documents
|
| 181 |
+
app.mount("/uploads", StaticFiles(directory=UPLOAD_FOLDER), name="uploads") #uploaded files will be saved in doc_ingestion
|
| 182 |
+
app.include_router(admin_router) #registeres all admin endpoints with FastAPI
|
| 183 |
+
#UPLOAD_FOLDER = /kkt_secure_modular_rag_engine/doc_ingestion
|
| 184 |
+
#URL: /uploads/file1.pdf
|
| 185 |
+
#File: /kkt_secure_modular_rag_engine/doc_ingestion/file1.pdf
|
| 186 |
+
#uploaded documents are saved in the doc_ingestion folder, but when they are opened in the browser they are served under the URL prefix defined here
|
| 187 |
+
#It will not expose the actual folder in the server http://localhost:8000/uploads/AI_book.pdf
|
| 188 |
+
|
| 189 |
+
# In[] Admin Credentials
|
| 190 |
+
|
| 191 |
+
# Fixed admin user
ADMIN_USERNAME = "admin"
# The admin password is read from the ADMIN_PASSWORD environment variable so a
# deployment can set its own secret. The historical plain-text value is kept
# only as a fallback so existing setups keep working; it is visible to anyone
# who can read this file and should be overridden.
ADMIN_PASSWORD = os.environ.get("ADMIN_PASSWORD") or "Tnivedha@123"
pwd_context = CryptContext(schemes=["bcrypt"], deprecated="auto")
# Only the bcrypt hash is compared at login time (see authenticate_admin).
ADMIN_PASSWORD_HASH = pwd_context.hash(ADMIN_PASSWORD)
|
| 196 |
+
|
| 197 |
+
def authenticate_admin(username: str, password: str):
    """Check the supplied credentials against the fixed admin account.

    Returns ``{"username": username}`` when the username equals
    ``ADMIN_USERNAME`` and the password verifies against
    ``ADMIN_PASSWORD_HASH``; returns ``None`` otherwise.
    """
    # The password is only verified when the username already matches.
    credentials_ok = username == ADMIN_USERNAME and verify_password(
        password, ADMIN_PASSWORD_HASH
    )
    if not credentials_ok:
        return None
    return {"username": username}  # returns user info for token
|
| 201 |
+
|
| 202 |
+
|
| 203 |
+
# In[]
|
| 204 |
+
# =====================================
|
| 205 |
+
# 🔐 Current User Dependency
|
| 206 |
+
# =====================================
|
| 207 |
+
def get_current_user(token: str = Depends(oauth2_scheme)):
    """Resolve the bearer token to the username stored in its ``sub`` claim.

    Raises:
        HTTPException: 401 "Invalid token" when decoding fails with a
            ``JWTError`` or when the payload carries no ``sub`` claim.
    """
    try:
        claims = jwt.decode(token, SECRET_KEY, algorithms=[ALGORITHM])
        subject: str = claims.get("sub")
    except JWTError:
        raise HTTPException(status_code=401, detail="Invalid token")

    if subject is None:
        raise HTTPException(status_code=401, detail="Invalid token")
    return subject
|
| 216 |
+
|
| 217 |
+
|
| 218 |
+
# In[] Data Base Tables
|
| 219 |
+
|
| 220 |
+
#Create an SQLite connection to the database file. Used in many places to connect to the database
|
| 221 |
+
def get_db():
    """Open and return a new SQLite connection to ``DB_PATH_FILE``.

    ``check_same_thread=False`` lets the returned connection be used from a
    thread other than the one that created it. The caller is responsible for
    closing the connection.
    """
    connection = sqlite3.connect(DB_PATH_FILE, check_same_thread=False)
    return connection
|
| 223 |
+
|
| 224 |
+
|
| 225 |
+
# =====================================
|
| 226 |
+
# 🗄 SQLite Database Setup
|
| 227 |
+
# =====================================
|
| 228 |
+
def init_db():
    """Create the ``users`` and ``document_chunks`` tables if they are missing.

    Both statements use ``CREATE TABLE IF NOT EXISTS``, so running this against
    an already initialised database leaves existing tables untouched. The
    connection is closed even when a statement fails.
    """
    conn = get_db()  # connect DB file
    try:
        cursor = conn.cursor()

        # Users table (existing)
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS users (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                username TEXT UNIQUE NOT NULL,
                hashed_password TEXT NOT NULL,
                created_at TEXT NOT NULL
            )
        """)

        # NEW: Chunk metadata table (one row per embedded chunk, keyed to its
        # vector through faiss_id)
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS document_chunks (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                faiss_id INTEGER,
                source TEXT NOT NULL,
                path TEXT,
                page INTEGER NOT NULL,
                text TEXT NOT NULL,
                created_at TEXT NOT NULL
            )
        """)

        conn.commit()
    finally:
        # Previously skipped when execute() raised, leaking the connection.
        conn.close()
|
| 257 |
+
|
| 258 |
+
init_db() #Database is initialized
|
| 259 |
+
|
| 260 |
+
'''
|
| 261 |
+
Explanation
|
| 262 |
+
faiss_id : 525
|
| 263 |
+
source : "Unit 1 AI & Python Complete.pdf" only file name is stored
|
| 264 |
+
path : "D:/.../doc_ingestion/Unit 1 AI & Python Complete.pdf"
|
| 265 |
+
text : "Artificial Intelligence is..."
|
| 266 |
+
page : 7
|
| 267 |
+
created_at : timestamp
|
| 268 |
+
|
| 269 |
+
Each real chunk will look like
|
| 270 |
+
{
|
| 271 |
+
"source": "Unit 1 AI & Python Complete.pdf",
|
| 272 |
+
"text": "Artificial Intelligence is...",
|
| 273 |
+
"page": 7
|
| 274 |
+
}
|
| 275 |
+
|
| 276 |
+
After embedding, FAISS contains
|
| 277 |
+
FAISS ID --> vector(one chunk text)
|
| 278 |
+
'''
|
| 279 |
+
|
| 280 |
+
# In[]
|
| 281 |
+
|
| 282 |
+
# ========================================================
|
| 283 |
+
# 🔐Store and Retrieve Chunks via SQLite
|
| 284 |
+
# ========================================================
|
| 285 |
+
def store_chunks_in_db(chunks, faiss_ids):
    """Insert one ``document_chunks`` row per (chunk, FAISS id) pair.

    Args:
        chunks: Sequence of dicts such as
            ``{"source": "file1.pdf", "text": "...", "page": 3}``. ``source``
            and ``text`` are required; ``page`` defaults to 0 when absent.
        faiss_ids: FAISS ids, one per chunk, in the same order as ``chunks``.

    The two sequences are paired with ``zip``, so surplus items in the longer
    one are ignored. All rows are committed together; if building or inserting
    a row fails, nothing is committed and the connection is still closed.
    """
    # Build every parameter tuple first so a malformed chunk fails before the
    # database is touched.
    rows = [
        (
            fid,
            chunk["source"],
            f"{UPLOAD_FOLDER}/{chunk['source']}",  # stored in the path column
            chunk["text"],
            chunk.get("page", 0),
        )
        for chunk, fid in zip(chunks, faiss_ids)
    ]

    conn = get_db()  # connect DB file
    try:
        conn.executemany("""
            INSERT INTO document_chunks
            (faiss_id, source, path, text, page, created_at)
            VALUES (?, ?, ?, ?, ?, datetime('now'))
        """, rows)
        conn.commit()
    finally:
        # Previously skipped when an insert raised, leaking the connection.
        conn.close()
|
| 307 |
+
|
| 308 |
+
|
| 309 |
+
def get_next_faiss_id():
    """Return the next unused FAISS id based on the rows already stored.

    Reads ``MAX(faiss_id)`` from ``document_chunks``. Returns 0 when that
    value is NULL (for example, an empty table); otherwise the maximum plus 1.
    """
    connection = get_db()  # connect DB file
    statement = connection.cursor()
    statement.execute("SELECT MAX(faiss_id) FROM document_chunks")
    # fetchone() yields a one-element tuple: (max_value,)
    (highest_id,) = statement.fetchone()[:1]
    connection.close()

    return 0 if highest_id is None else highest_id + 1
|
| 326 |
+
|
| 327 |
+
|
| 328 |
+
def fetch_chunks_by_faiss_ids(faiss_ids):
    """Return chunk metadata for the given FAISS ids, in the order given.

    Args:
        faiss_ids: Iterable of integer FAISS ids, most relevant first. Values
            are converted with ``int()`` so NumPy integers are accepted.

    Returns:
        A list of dicts with keys ``faiss_id``, ``text``, ``source`` and
        ``page``. Ids with no matching row are skipped. An empty input yields
        an empty list without querying the database.
    """
    ids = [int(fid) for fid in faiss_ids]
    if not ids:
        return []

    # One "?" per id, e.g. ids = [101, 205, 87] -> "?,?,?".
    placeholders = ",".join("?" * len(ids))
    query = f"""
        SELECT faiss_id, text, source, page
        FROM document_chunks
        WHERE faiss_id IN ({placeholders})
    """

    conn = get_db()  # connect DB file
    try:
        # rows is a list of tuples such as (101, "text1", "file1.pdf", 2)
        rows = conn.execute(query, ids).fetchall()
    finally:
        # Previously skipped when the query raised, leaking the connection.
        conn.close()

    # { faiss_id -> row_data }
    id_to_row = {
        row[0]: {
            "faiss_id": row[0],
            "text": row[1],
            "source": row[2],
            "page": row[3],
        }
        for row in rows
    }

    # SQL does not return rows in the requested order, so re-order them to
    # follow the ranking of the ids that were passed in.
    return [id_to_row[fid] for fid in ids if fid in id_to_row]
|
| 369 |
+
|
| 370 |
+
|
| 371 |
+
def retrieve_relevant_chunks(query, top_k=5):
    """Return stored chunks nearest to ``query``, nearest first.

    The query is embedded with ``EMBEDDING_MODEL``, searched in
    ``VECTOR_INDEX`` and the resulting ids are resolved to chunk metadata via
    ``fetch_chunks_by_faiss_ids``.

    Args:
        query: Query text to embed.
        top_k: Maximum number of chunks to return. It is clamped to the number
            of vectors in the index; a resulting value of 0 or less yields an
            empty list instead of being passed to FAISS.

    Raises:
        HTTPException: 503 when ``INDEX_READY`` is false or ``VECTOR_INDEX`` is
            ``None``; 500 when ``EMBEDDING_MODEL`` is ``None``.
    """
    if not INDEX_READY or VECTOR_INDEX is None:
        raise HTTPException(status_code=503, detail="Index is still building")
    if EMBEDDING_MODEL is None:
        raise HTTPException(status_code=500, detail="Embedding model not loaded")

    # Encode query. The embedding is not normalised here; the index rebuilt in
    # this module is an L2 (IndexFlatL2) index.
    query_embedding = np.array(EMBEDDING_MODEL.encode([query])).astype("float32")

    # FAISS search
    with FAISS_LOCK:
        k = min(top_k, VECTOR_INDEX.ntotal)
        if k <= 0:
            return []
        # The clamped k is now actually used; the original computed it and
        # then searched with top_k.
        distances, indices = VECTOR_INDEX.search(query_embedding, k)

    print("FAISS distances:", distances)
    print("FAISS indices:", indices)

    # To avoid potential crash in empty FAISS search
    if indices is None or len(indices[0]) == 0:
        return []

    # -1 entries are dropped before the database lookup.
    faiss_ids = [int(i) for i in indices[0] if i != -1]

    # Fetch metadata from SQLite
    return fetch_chunks_by_faiss_ids(faiss_ids)
|
| 402 |
+
|
| 403 |
+
# ========================================================
|
| 404 |
+
# 🔄 Rebuild FAISS Index From SQLite on Server Start
|
| 405 |
+
# ========================================================
|
| 406 |
+
|
| 407 |
+
def rebuild_faiss_index():
# Rebuilds the in-memory FAISS index (VECTOR_INDEX) from every row of the
# document_chunks table and writes it to FAISS_INDEX_PATH.
# startup_event runs this function in a background thread.
|
| 408 |
+
global VECTOR_INDEX, EMBEDDING_MODEL, INDEX_READY
|
| 409 |
+
|
| 410 |
+
with FAISS_LOCK:
|
| 411 |
+
|
| 412 |
+
# Mark the index as unavailable; retrieve_relevant_chunks answers 503 while this is False.
INDEX_READY = False
|
| 413 |
+
|
| 414 |
+
conn = get_db()
|
| 415 |
+
cursor = conn.cursor()
|
| 416 |
+
|
| 417 |
+
# Load every stored chunk, ordered by its FAISS id.
cursor.execute("""
|
| 418 |
+
SELECT faiss_id, text
|
| 419 |
+
FROM document_chunks
|
| 420 |
+
ORDER BY faiss_id
|
| 421 |
+
""")
|
| 422 |
+
|
| 423 |
+
rows = cursor.fetchall()
|
| 424 |
+
conn.close()
|
| 425 |
+
|
| 426 |
+
# No stored chunks: drop the in-memory index and delete any index file left on disk.
if not rows:
|
| 427 |
+
VECTOR_INDEX = None
|
| 428 |
+
INDEX_READY = True
|
| 429 |
+
|
| 430 |
+
if os.path.exists(FAISS_INDEX_PATH):
|
| 431 |
+
os.remove(FAISS_INDEX_PATH)
|
| 432 |
+
|
| 433 |
+
print("No documents found. FAISS cleared and file removed.")
|
| 434 |
+
return
|
| 435 |
+
|
| 436 |
+
# rows are (faiss_id, text) tuples.
texts = [row[1] for row in rows]
|
| 437 |
+
ids = [row[0] for row in rows]
|
| 438 |
+
|
| 439 |
+
# Re-embed every chunk text and cast the result to float32 before it is added to the index.
embeddings = EMBEDDING_MODEL.encode(texts)
|
| 440 |
+
embeddings = np.array(embeddings).astype("float32")
|
| 441 |
+
|
| 442 |
+
dimension = embeddings.shape[1]
|
| 443 |
+
|
| 444 |
+
# Flat L2 index wrapped in an IndexIDMap so each vector is stored under its faiss_id from the database.
base_index = faiss.IndexFlatL2(dimension)
|
| 445 |
+
VECTOR_INDEX = faiss.IndexIDMap(base_index)
|
| 446 |
+
|
| 447 |
+
VECTOR_INDEX.add_with_ids(
|
| 448 |
+
embeddings,
|
| 449 |
+
np.array(ids, dtype="int64")
|
| 450 |
+
)
|
| 451 |
+
|
| 452 |
+
# The in-memory index is usable from here on; the write to disk happens afterwards.
INDEX_READY = True
|
| 453 |
+
|
| 454 |
+
print(f"FAISS index rebuilt with {len(texts)} chunks.")
|
| 455 |
+
|
| 456 |
+
faiss.write_index(VECTOR_INDEX, FAISS_INDEX_PATH)
|
| 457 |
+
print("FAISS index saved to disk.")
|
| 458 |
+
# Redundant: the flag was already set to True before the index was written.
INDEX_READY = True
|
| 459 |
+
|
| 460 |
+
# In[]
|
| 461 |
+
|
| 462 |
+
@app.on_event("startup")
def startup_event():
    """Startup hook: ensure the upload folder exists and start the index rebuild.

    ``rebuild_faiss_index`` is run on a separate thread, so this handler
    returns without waiting for the rebuild to finish.
    """
    print("Server started successfully")
    os.makedirs(UPLOAD_FOLDER, exist_ok=True)

    rebuild_worker = threading.Thread(target=rebuild_faiss_index)
    rebuild_worker.start()
|
| 467 |
+
|
| 468 |
+
# In[]
|
| 469 |
+
# ============================================
|
| 470 |
+
# Save uploaded file
|
| 471 |
+
# ============================================
|
| 472 |
+
async def save_file(file: UploadFile):
    """Stream an uploaded file into ``UPLOAD_FOLDER`` and return its path.

    Only the final path component of the client-supplied filename is used, so
    the file is always created directly inside ``UPLOAD_FOLDER``.

    Args:
        file: The uploaded file. Its read pointer is reset to 0 before
            returning so the caller can read the content again.

    Returns:
        The path of the newly written file.

    Raises:
        HTTPException: 400 when the upload has no usable filename, or when a
            file with that name already exists.
    """
    # file.filename may be None or empty; Path(None) would raise TypeError and
    # an empty name would resolve to the upload folder itself.
    filename = Path(file.filename or "").name
    if not filename or filename == "..":
        raise HTTPException(
            status_code=400,
            detail="Uploaded file has no usable filename."
        )

    file_path = os.path.join(UPLOAD_FOLDER, filename)

    try:
        # "x" creates the file exclusively: the existence check and the
        # creation are a single step, so two concurrent uploads of the same
        # name cannot both pass the check.
        buffer = open(file_path, "xb")
    except FileExistsError:
        raise HTTPException(
            status_code=400,
            detail="File already exists. Please rename or delete the existing file."
        )

    try:
        with buffer:
            # Copy in 1 MiB pieces so large uploads are not held in memory.
            while chunk := await file.read(1024 * 1024):
                buffer.write(chunk)
    except BaseException:
        # Remove the partial file; otherwise a retry of the same upload would
        # be rejected with "File already exists".
        try:
            os.remove(file_path)
        except OSError:
            pass
        raise

    await file.seek(0)  # reset pointer (important)

    return file_path
|
| 489 |
+
|
| 490 |
+
|
| 491 |
+
# In[]
|
| 492 |
+
|
| 493 |
+
# =====================================
|
| 494 |
+
# 👤 User Registration
|
| 495 |
+
# =====================================
|
| 496 |
+
class UserRegister(BaseModel):
    """Request body accepted by POST /register."""

    # Login name; register() answers 400 when the INSERT hits an IntegrityError.
    username: str
    # Password as sent by the client; register() hashes it before storing it.
    password: str
|
| 499 |
+
|
| 500 |
+
@app.post("/register")
def register(user: UserRegister):
    """Create a new user row with a hashed password.

    Args:
        user: Username and plain-text password from the request body.

    Returns:
        A JSON-serialisable dict with a success message.

    Raises:
        HTTPException(400): the INSERT violated an integrity constraint
            (reported to the client as "Username already exists").
    """
    # Hash before opening the connection so a hashing failure cannot leak it.
    hashed_pw = hash_password(user.password)

    conn = get_db()
    try:
        cursor = conn.cursor()
        cursor.execute(
            "INSERT INTO users (username, hashed_password, created_at) VALUES (?, ?, ?)",
            (user.username, hashed_pw, datetime.now().isoformat())
        )
        conn.commit()
    except sqlite3.IntegrityError:
        raise HTTPException(status_code=400, detail="Username already exists")
    finally:
        # Runs on success, on the duplicate-user path, and on unexpected errors.
        conn.close()

    return {"message": "User registered successfully"}
|
| 519 |
+
|
| 520 |
+
|
| 521 |
+
# =====================================
|
| 522 |
+
# 🔑 Login Endpoint
|
| 523 |
+
# =====================================
|
| 524 |
+
|
| 525 |
+
@app.post("/login")
def login(form_data: OAuth2PasswordRequestForm = Depends()):
    """Authenticate a user and return a bearer token.

    The configured admin account is checked first; any other username is
    looked up in the users table.

    Args:
        form_data: OAuth2 password form carrying username and password.

    Returns:
        Dict with "access_token" and "token_type" ("bearer").

    Raises:
        HTTPException(400): unknown username or wrong password. Both cases
            return the same detail text.
    """
    # --- Check hardcoded admin first ---
    if form_data.username == ADMIN_USERNAME and verify_password(form_data.password, ADMIN_PASSWORD_HASH):
        access_token = create_access_token(data={"sub": ADMIN_USERNAME})
        return {"access_token": access_token, "token_type": "bearer"}

    # --- Otherwise fall back to database users ---
    conn = get_db()
    try:
        cursor = conn.cursor()
        cursor.execute(
            "SELECT id, username, hashed_password FROM users WHERE username = ?",
            (form_data.username,)
        )
        user = cursor.fetchone()
    finally:
        # Close even when the query raises, so the connection is never leaked.
        conn.close()

    if not user:
        raise HTTPException(status_code=400, detail="Invalid credentials")

    _user_id, username, hashed_password = user

    if not verify_password(form_data.password, hashed_password):
        raise HTTPException(status_code=400, detail="Invalid credentials")

    access_token = create_access_token(data={"sub": username})
    return {"access_token": access_token, "token_type": "bearer"}
|
| 552 |
+
|
| 553 |
+
|
| 554 |
+
# =======================================================================
|
| 555 |
+
# 🔐 Upload Files Using FastAPI User Interface and Split in to Chunks
|
| 556 |
+
# =======================================================================
|
| 557 |
+
@app.post("/upload")
async def upload_file(file: UploadFile = File(...),current_user: str = Depends(get_current_user)):
    """Save an uploaded document, chunk and embed it, then index it in FAISS and SQLite.

    Steps: save the file, chunk the upload folder, keep the chunks that belong
    to this file, embed them, add the vectors to the FAISS index (persisting
    it to disk), and finally store the chunk metadata in the database.

    Args:
        file: The uploaded document.
        current_user: Authenticated user resolved by get_current_user.

    Returns:
        Dict with a "message"; on success also "chunks_added".

    Raises:
        HTTPException(400): propagated from save_file (e.g. duplicate name).
    """
    global VECTOR_INDEX, EMBEDDING_MODEL, INDEX_READY

    file_path = await save_file(file)
    # Compare on the stripped base name, matching how chunk sources are compared below.
    filename = Path(file.filename).name.strip()

    # DocChunker walks every file in UPLOAD_FOLDER, so filter down to this upload.
    chunker = DocChunker(doc_folder=UPLOAD_FOLDER)
    all_chunks = chunker.chunk_documents()
    new_chunks = [c for c in all_chunks if c["source"].strip() == filename]

    if not new_chunks:
        # Nothing was indexed. Remove the file this request just created,
        # otherwise a retry would be rejected with "File already exists".
        if os.path.exists(file_path):
            os.remove(file_path)
        return {"message": "No text extracted from document."}

    new_texts = [clean_text(chunk["text"]) for chunk in new_chunks]

    # Encode new chunks
    new_embeddings = EMBEDDING_MODEL.encode(new_texts)
    new_embeddings = np.array(new_embeddings).astype("float32")

    # Determine FAISS ids using SQLite
    start_id = get_next_faiss_id()
    faiss_ids = list(range(start_id, start_id + len(new_embeddings)))

    # Update FAISS first, so metadata is only stored for vectors that exist.
    with FAISS_LOCK:
        if VECTOR_INDEX is None:
            dimension = new_embeddings.shape[1]
            base_index = faiss.IndexFlatL2(dimension)
            VECTOR_INDEX = faiss.IndexIDMap(base_index)

        VECTOR_INDEX.add_with_ids(
            new_embeddings,
            np.array(faiss_ids, dtype="int64")
        )
        faiss.write_index(VECTOR_INDEX, FAISS_INDEX_PATH)
        INDEX_READY = True

    # Store metadata after FAISS succeeds.
    store_chunks_in_db(new_chunks, faiss_ids)

    return {
        "message": f"{file.filename} uploaded and indexed successfully",
        "chunks_added": len(new_chunks)
    }
|
| 609 |
+
|
| 610 |
+
@app.post("/admin/upload-document")
async def upload_document(
    file: UploadFile = File(...),
    current_user: str = Depends(get_current_user)
):
    """Admin wrapper around upload_file that always answers with a JSON "message".

    Any exception raised by upload_file is converted into a normal response
    whose message starts with "Upload failed:", so the client alert always has
    text to show.
    """
    try:
        outcome = await upload_file(file, current_user)
    except Exception as e:
        # Deliberate best effort: report the failure in the body instead of raising.
        return {"message": f"Upload failed: {str(e)}"}
    return outcome
|
| 620 |
+
|
| 621 |
+
|
| 622 |
+
# In[]
|
| 623 |
+
# =====================================
|
| 624 |
+
# 📦 Request Models
|
| 625 |
+
# =====================================
|
| 626 |
+
class Message(BaseModel):
    """One chat message in a ChatRequest."""

    # Speaker role; chat_completion lower-cases it before forwarding.
    role: str
    # Message text.
    content: str
|
| 629 |
+
|
| 630 |
+
|
| 631 |
+
class ChatRequest(BaseModel):
    """Request body for POST /v1/chat/completions."""

    # Model id; chat_completion falls back to DEFAULT_MODEL when omitted.
    model: Optional[str] = None
    # Conversation so far; the last entry is used as the retrieval query.
    messages: List[Message]
    # Sampling temperature forwarded to the upstream chat API.
    temperature: Optional[float] = 0.7
    # How citations are rendered: "inline", "list", "both" or "none".
    reference_style: Optional[str] = "both"
|
| 636 |
+
|
| 637 |
+
|
| 638 |
+
# =====================================
|
| 639 |
+
# 🌍 Tamil Detection
|
| 640 |
+
# =====================================
|
| 641 |
+
def contains_tamil(text: str) -> bool:
    """Return True when text has at least one character in the Tamil Unicode block (U+0B80-U+0BFF)."""
    for ch in text:
        if "\u0b80" <= ch <= "\u0bff":
            return True
    return False
|
| 643 |
+
|
| 644 |
+
# In[]
|
| 645 |
+
|
| 646 |
+
# =====================================
|
| 647 |
+
# 🛡 Retrieval Evidence Detection
|
| 648 |
+
# =====================================
|
| 649 |
+
def has_retrieved_context(messages: List[Message]) -> bool:
    """
    Report whether any message appears to carry injected document context.

    A message counts when its lower-cased content contains "source:",
    "document:", or the bare word fragment "page" (no colon required).
    """
    markers = ("source:", "page", "document:")
    return any(
        marker in m.content.lower()
        for m in messages
        for marker in markers
    )
|
| 659 |
+
|
| 660 |
+
|
| 661 |
+
def refusal_response(reason: str):
    """Build an OpenAI-style chat.completion payload whose assistant message is `reason`.

    The model is reported as "control-layer" and all usage counters are zero.
    """
    assistant_choice = {
        "index": 0,
        "message": {"role": "assistant", "content": reason},
        "finish_reason": "stop",
    }
    zero_usage = {
        "prompt_tokens": 0,
        "completion_tokens": 0,
        "total_tokens": 0,
    }
    created_at = int(datetime.now().timestamp())
    return {
        "id": "chatcmpl-local",
        "object": "chat.completion",
        "created": created_at,
        "model": "control-layer",
        "choices": [assistant_choice],
        "usage": zero_usage,
    }
|
| 683 |
+
|
| 684 |
+
|
| 685 |
+
def apply_reference_style(assistant_message, references_map, style):
    """Render the [DocN] citation markers of an answer in the requested style.

    Any "(... .pdf ...)" parenthetical and everything from "References:"
    onwards is stripped first, so only citations produced here remain.

    Args:
        assistant_message: Raw answer text, possibly containing [DocN] markers.
        references_map: Maps each marker (e.g. "[Doc1]") to its display text.
        style: One of
            "inline" - replace each known marker with "(reference)";
            "list"   - drop the markers and append a reference list (all
                       references when the answer used no marker);
            "both"   - inline replacement plus a list of the cited references;
            "none"   - drop the markers and add nothing.
            Any other value returns the stripped message unchanged.

    Returns:
        The formatted answer (HTML line breaks are used for the list).
    """
    marker_pattern = r"\[Doc\d+\]"

    def cited_references(text):
        # References for the markers used in text, first-use order, no duplicates.
        refs = []
        for marker in re.findall(marker_pattern, text):
            if marker in references_map:
                ref = references_map[marker]
                if ref not in refs:
                    refs.append(ref)
        return refs

    def with_inline_links(text):
        # Swap every known marker for its parenthesised reference text.
        for marker, ref_text in references_map.items():
            text = text.replace(marker, f"({ref_text})")
        return text

    def with_reference_list(text, refs):
        # Append the bibliography block, or nothing when there are no refs.
        if not refs:
            return text
        return (
            text
            + "<br><br><br><b>References:</b><br>"
            + "<br>".join(f"- {r}" for r in refs)
        )

    # Remove citations the model wrote on its own: inline "(… .pdf …)" first,
    # then any trailing bibliography.
    assistant_message = re.sub(r"\([^)]*\.pdf[^)]*\)", "", assistant_message, flags=re.IGNORECASE)
    assistant_message = re.sub(r"References:.*", "", assistant_message, flags=re.IGNORECASE | re.DOTALL)

    if style == "inline":
        return with_inline_links(assistant_message)

    if style == "list":
        refs = cited_references(assistant_message)
        assistant_message = re.sub(marker_pattern, "", assistant_message)
        # Fall back to every retrieved reference when none was cited.
        return with_reference_list(assistant_message, refs or list(references_map.values()))

    if style == "both":
        # Collect the cited markers before they are replaced by links.
        refs = cited_references(assistant_message)
        return with_reference_list(with_inline_links(assistant_message), refs)

    if style == "none":
        # The model is told not to cite, but may still emit markers; strip
        # them so raw "[Doc1]" never reaches the user.
        return re.sub(marker_pattern, "", assistant_message)

    return assistant_message
|
| 745 |
+
|
| 746 |
+
|
| 747 |
+
|
| 748 |
+
# =====================================
|
| 749 |
+
# 🔐 Password Utilities
|
| 750 |
+
# =====================================
|
| 751 |
+
def hash_password(password: str) -> str:
    """Return the hash of `password` produced by the module's `pwd_context`."""
    hashed = pwd_context.hash(password)
    return hashed
|
| 753 |
+
|
| 754 |
+
def verify_password(plain_password: str, hashed_password: str) -> bool:
    """Check `plain_password` against `hashed_password` using `pwd_context`."""
    is_match = pwd_context.verify(plain_password, hashed_password)
    return is_match
|
| 756 |
+
|
| 757 |
+
def create_access_token(data: dict):
    """Encode `data` as a JWT signed with SECRET_KEY using ALGORITHM.

    Raises:
        ValueError: `data` has no "sub" claim.
    """
    has_subject = "sub" in data
    if has_subject:
        return jwt.encode(data, SECRET_KEY, algorithm=ALGORITHM)
    raise ValueError("Token data must include 'sub'")
|
| 762 |
+
|
| 763 |
+
|
| 764 |
+
|
| 765 |
+
@app.get("/protected")
def protected_route(current_user: str = Depends(get_current_user)):
    """Greet the caller by the name resolved through get_current_user."""
    greeting = f"Hello {current_user}"
    return {"message": greeting}
|
| 768 |
+
|
| 769 |
+
#root endpoint
|
| 770 |
+
@app.get("/")
def serve_ui():
    """Serve the client's index page from the relative path FastAPI_Client/index.html."""
    index_page = "FastAPI_Client/index.html"
    return FileResponse(index_page)
|
| 773 |
+
|
| 774 |
+
@app.get("/v1/models")
def list_models():
    """List every entry of ALLOWED_MODELS in the OpenAI "list of models" shape."""
    entries = []
    for model in ALLOWED_MODELS:
        entries.append({
            "id": model,
            "object": "model",
            "created": 0,
            "owned_by": "local",
        })
    return {"object": "list", "data": entries}
|
| 788 |
+
|
| 789 |
+
# In[]
|
| 790 |
+
|
| 791 |
+
# =====================================
|
| 792 |
+
# 💬 Chat Endpoint
|
| 793 |
+
# =====================================
|
| 794 |
+
# Prompt fragments per reference style: (citation instruction, rules listed under "Rules:").
_CITATION_RULES = {
    "none": (
        "STRICTLY DO NOT include any citations or markers.",
        "- Do NOT include any citation markers like [Doc1].\n"
        "- Do NOT include any References section.",
    ),
    "inline": (
        "Include inline citation markers like [Doc1].",
        "- Use ONLY the markers [Doc1], [Doc2], etc.\n"
        "- Do NOT write document names yourself.\n"
        "- Do NOT invent citations.\n"
        "- Do NOT include any References section.",
    ),
    "list": (
        "STRICTLY DO NOT include any inline citation markers like [Doc1].",
        "- Do NOT include any inline citation markers like [Doc1].\n"
        "- Do NOT write document names yourself.\n"
        "- Do NOT invent citations.\n"
        "- Do NOT include any References section.",
    ),
    "both": (
        "STRICTLY include citation markers like [Doc1], [Doc2] in every factual sentence.",
        "- Use ONLY the markers [Doc1], [Doc2], etc.\n"
        "- Do NOT write document names yourself.\n"
        "- Do NOT invent citations.",
    ),
}


def _build_rag_context(retrieved_chunks):
    """Format the retrieved chunks as numbered [DocN] sections for the system prompt."""
    sections = []
    for i, c in enumerate(retrieved_chunks, start=1):
        sections.append(
            f"[Doc{i}]\n"
            f"Document: {c['source']}\n"
            f"Page: {c['page']}\n"
            f"Content:\n"
            f"{clean_text(c['text'])}"
        )
    return "\n\n".join(sections)


def _build_references_map(retrieved_chunks):
    """Map each [DocN] marker to an HTML link to the source document and page."""
    # Function-scope imports keep the file's import block untouched.
    from html import escape
    from urllib.parse import quote

    references_map = {}
    for i, c in enumerate(retrieved_chunks, start=1):
        source = str(c["source"])
        page = str(c["page"])
        # File names come from uploads: percent-encode them for the URL and
        # HTML-escape them for the label so they cannot break the markup.
        href = f"/uploads/{quote(source)}#page={quote(page)}"
        label = escape(f"{source} — Page {page}")
        references_map[f"[Doc{i}]"] = f"<a href='{href}' target='_blank'>{label}</a>"
    return references_map


@app.post("/v1/chat/completions")
async def chat_completion(request: ChatRequest):
    """Answer a chat request from retrieved document chunks via the Groq chat API.

    Flow: validate the model, retrieve chunks for the last message, refuse
    when nothing was retrieved, build a grounded system prompt, forward the
    conversation to Groq, then format citations per `reference_style`.

    Args:
        request: Model id, messages, temperature and reference style.

    Returns:
        An OpenAI-compatible chat.completion dict (usage counters are zero).

    Raises:
        HTTPException(400): model not in ALLOWED_MODELS, or no messages given.
        HTTPException(500): the Groq API answered with a non-200 status.
    """
    # Model selection from WebUI
    selected_model = request.model or DEFAULT_MODEL
    if selected_model not in ALLOWED_MODELS:
        raise HTTPException(
            status_code=400,
            detail=f"Model '{selected_model}' is not allowed."
        )

    if not request.messages:
        # messages[-1] below would otherwise fail with an IndexError (HTTP 500).
        raise HTTPException(status_code=400, detail="At least one message is required.")

    user_message = request.messages[-1].content

    # Step 0: retrieve relevant chunks (dicts with "text", "source", "page").
    retrieved_chunks = retrieve_relevant_chunks(user_message, top_k=5)
    print("Number of chunks:", len(retrieved_chunks))

    # Hallucination control: evidence gate before any LLM call.
    if not retrieved_chunks:
        print("🚫 BLOCKED BEFORE LLM CALL — No retrieved evidence detected.")
        return refusal_response(
            "The answer is not found in the local documents provided by KKT."
        )

    rag_context = _build_rag_context(retrieved_chunks)
    references_map = _build_references_map(retrieved_chunks)

    # Normalise once so prompt rules and post-processing always agree; an
    # unknown or differently-cased value can no longer skip every branch.
    style = (request.reference_style or "both").lower()
    if style not in _CITATION_RULES:
        style = "both"
    citation_instruction, rules_text = _CITATION_RULES[style]

    # Language handling: carried into the final prompt so it is not lost.
    language_instruction = (
        "Always respond only in Tamil.\n\n" if contains_tamil(user_message) else ""
    )

    # Inject retrieved context as system message
    system_prompt = (
        "You are a document-grounded AI assistant.\n\n"
        "Answer the question ONLY using the provided context.\n\n"
        f"{language_instruction}"
        f"{citation_instruction}\n\n"
        "Rules:\n"
        f"{rules_text}\n\n"
        "If the answer is not present in the context, say the information is not available.\n\n"
        "Context:\n"
        f"{rag_context}\n"
    )

    final_messages = [{"role": "system", "content": system_prompt}]
    for m in request.messages:
        final_messages.append({
            "role": m.role.lower(),
            "content": m.content
        })

    print("MODEL:", selected_model)
    print("AVAILABLE MODELS:", ALLOWED_MODELS)
    print("SENDING TO GROQ:", final_messages)
    # The API key is intentionally never logged.

    try:
        async with httpx.AsyncClient(timeout=120.0) as client:
            response = await client.post(
                url="https://api.groq.com/openai/v1/chat/completions",
                headers={
                    "Authorization": f"Bearer {os.getenv('GROQ_API_KEY')}",
                    "Content-Type": "application/json"
                },
                json={
                    "model": selected_model,
                    "messages": final_messages,
                    "temperature": request.temperature,
                    "stream": False
                }
            )

        if response.status_code != 200:
            print("STATUS:", response.status_code)
            print("ERROR:", response.text)
            raise HTTPException(status_code=500, detail="Groq API Error")

        result = response.json()

        # Tolerate a missing or empty "choices" list and a null content field.
        choices = result.get("choices") or [{}]
        assistant_message = (choices[0].get("message") or {}).get("content") or ""

        assistant_message = apply_reference_style(assistant_message, references_map, style)

    except Exception as e:
        print("FULL ERROR:", e)
        raise

    # Return OpenAI-compatible response
    return {
        "id": "chatcmpl-local",
        "object": "chat.completion",
        "created": int(datetime.now().timestamp()),
        "model": selected_model,
        "choices": [
            {
                "index": 0,
                "message": {
                    "role": "assistant",
                    "content": assistant_message
                },
                "finish_reason": "stop"
            }
        ],
        "usage": {
            "prompt_tokens": 0,
            "completion_tokens": 0,
            "total_tokens": 0
        }
    }
|
| 1016 |
+
|
| 1017 |
+
|
| 1018 |
+
|
| 1019 |
+
|
config.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
Created on Mon Apr 13 19:22:03 2026
|
| 4 |
+
|
| 5 |
+
@author: THYAGHARAJAN
|
| 6 |
+
"""
|
| 7 |
+
#NOTE
|
| 8 |
+
# All uploaded files are saved in the doc_ingestion folder, next to the server file.
|
| 9 |
+
|
| 10 |
+
#BASE_DIR -- full absolute directory path of the current script file
|
| 11 |
+
#UPLOAD_FOLDER -- This folder is created if not available see startup_event() in the server file
|
| 12 |
+
#DB_FILE = "kkt_SQLite_DB.db"
|
| 13 |
+
#DB_PATH_FILE -- full path of the database file (DB_FILE) in the same directory as this script (BASE_DIR)
|
| 14 |
+
#FAISS_INDEX_PATH = os.path.join(BASE_DIR, "faiss.index")
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
import os
|
| 18 |
+
|
| 19 |
+
# Absolute directory that contains this config file.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))

# Folder that receives uploaded documents; startup_event() in the server file creates it.
UPLOAD_FOLDER = os.path.join(BASE_DIR, "doc_ingestion")

# SQLite database file name, and its full path next to this file.
# The server file uses DB_PATH_FILE to connect to the database.
DB_FILE = "kkt_SQLite_DB.db"
DB_PATH_FILE = os.path.join(BASE_DIR, DB_FILE)

# Location of the FAISS index file on disk.
FAISS_INDEX_PATH = os.path.join(BASE_DIR, "faiss.index")
|
doc_ingestion/AIML_Unit1_RMD_ECE.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e12a289078f97daafeade764a58c9c217b5d25d2ba69c056bbf4f338046cad46
|
| 3 |
+
size 2663371
|
doc_ingestion/AIML_Unit2_RMD_ECE.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d68a2034ebc91200784fd39eae4818f407811e26e9d134183b710d2f3c28f663
|
| 3 |
+
size 3598628
|
faiss.index
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b6e5420cf84512c0c5438b7c21831c098ba4a155f1c5b47ae2013f538732472e
|
| 3 |
+
size 1085522
|
kkt_SQLite_DB.db
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c4cd7fa92c9fec7ae91c60ac2c43f7e039dc8a9a8fb1498602987035caf4158c
|
| 3 |
+
size 389120
|
rag/__init__.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
Created on Wed Mar 4 12:55:04 2026
|
| 4 |
+
|
| 5 |
+
@author: THYAGHARAJAN
|
| 6 |
+
"""
|
| 7 |
+
|
rag/chunker.py
ADDED
|
@@ -0,0 +1,160 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
Created on Tue Mar 3 16:41:30 2026
|
| 4 |
+
|
| 5 |
+
@author: THYAGHARAJAN
|
| 6 |
+
|
| 7 |
+
Reads PDFs from kkt_AIML_PDFs/
|
| 8 |
+
Chunk into fixed size segments
|
| 9 |
+
Return list of chunks with metadata
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import os
|
| 13 |
+
from typing import List, Dict
|
| 14 |
+
from pypdf import PdfReader
|
| 15 |
+
from docx import Document
|
| 16 |
+
import pandas as pd
|
| 17 |
+
from PIL import Image
|
| 18 |
+
import pytesseract
|
| 19 |
+
import cv2
|
| 20 |
+
from pytesseract import Output
|
| 21 |
+
|
| 22 |
+
from utils.text_cleanerV2 import clean_text
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
import shutil
|
| 26 |
+
|
| 27 |
+
# Use the tesseract executable found on PATH, when there is one; otherwise
# pytesseract keeps its own default command.
tesseract_path = shutil.which("tesseract")
if tesseract_path is not None and tesseract_path != "":
    pytesseract.pytesseract.tesseract_cmd = tesseract_path
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
class DocChunker:
    """
    Handles document ingestion and text chunking.

    Reads every supported file in a folder (PDF, DOCX, Excel, PNG/JPG images)
    and splits the extracted text into fixed-size, overlapping chunks.
    """

    def __init__(self, doc_folder: str, chunk_size: int = 500, overlap: int = 50):
        """
        Args:
            doc_folder: Folder whose files are ingested.
            chunk_size: Maximum number of characters per chunk (must be > 0).
            overlap: Characters shared by consecutive chunks
                (0 <= overlap < chunk_size).

        Raises:
            ValueError: the sizes would make chunking skip text or never
                advance (which previously caused an endless loop).
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        if overlap < 0 or overlap >= chunk_size:
            raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

        self.doc_folder = doc_folder
        self.chunk_size = chunk_size
        self.overlap = overlap

    # ---------------------------------------------------
    # Load and Parse Documents (PDF, DOCX, Excel, Images)
    # ---------------------------------------------------

    @staticmethod
    def _ocr_image(file_path: str) -> str:
        """Run OCR on one image file and return the recognised text."""
        img = cv2.imread(file_path)
        if img is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise ValueError("image could not be read")

        # Detect orientation and rotate the image upright before OCR.
        osd = pytesseract.image_to_osd(img, output_type=Output.DICT)
        angle = osd["rotate"]
        if angle == 90:
            img = cv2.rotate(img, cv2.ROTATE_90_CLOCKWISE)
        elif angle == 180:
            img = cv2.rotate(img, cv2.ROTATE_180)
        elif angle == 270:
            img = cv2.rotate(img, cv2.ROTATE_90_COUNTERCLOCKWISE)

        # Grayscale, 2x upscale and Otsu threshold to help the OCR engine.
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        gray = cv2.resize(gray, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)
        thresh = cv2.threshold(
            gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
        )[1]

        return pytesseract.image_to_string(thresh, config="--psm 6")

    def load_pdfs(self) -> List[Dict]:
        """
        Reads all supported documents and returns page-level texts with metadata.
        (Method name preserved for compatibility.)

        Returns:
            A list of dicts with "text", "source" (file name) and "page".
            PDFs yield one entry per page that has text; other formats yield a
            single entry with page 1. A file that fails to parse is reported
            on stdout and skipped.
        """
        documents = []

        for filename in os.listdir(self.doc_folder):
            file_path = os.path.join(self.doc_folder, filename)
            ext = filename.lower().split(".")[-1]

            try:
                # ---------------- PDF ----------------
                if ext == "pdf":
                    reader = PdfReader(file_path)
                    for page_number, page in enumerate(reader.pages, start=1):
                        text = page.extract_text()
                        if text:
                            documents.append({
                                "text": text.strip(),
                                "source": filename,
                                "page": page_number
                            })

                # ---------------- DOCX ----------------
                elif ext == "docx":
                    doc = Document(file_path)
                    full_text = "\n".join(p.text for p in doc.paragraphs)
                    documents.append({
                        "text": full_text.strip(),
                        "source": filename,
                        "page": 1
                    })

                # ---------------- Excel ----------------
                elif ext in ["xlsx", "xls"]:
                    df = pd.read_excel(file_path)
                    documents.append({
                        "text": df.to_string(),
                        "source": filename,
                        "page": 1
                    })

                # ---------------- Image (OCR) ----------------
                elif ext in ["png", "jpg", "jpeg"]:
                    text = self._ocr_image(file_path)
                    documents.append({
                        "text": text.strip(),
                        "source": filename,
                        "page": 1
                    })

            except Exception as e:
                # Best effort: one bad file must not stop the whole ingestion.
                print(f"Error processing {filename}: {e}")

        return documents

    # ---------------------------------------------------
    # Chunk Text
    # ---------------------------------------------------

    def chunk_documents(self) -> List[Dict]:
        """
        Splits document text into smaller chunks.

        Returns:
            A list of dicts with "text", "source" and "page"; consecutive
            chunks of the same page share `overlap` characters.
        """
        pages = self.load_pdfs()
        chunks = []
        # Positive by construction, see the checks in __init__.
        step = self.chunk_size - self.overlap

        for page in pages:
            cleaned_text = clean_text(page["text"])

            for start in range(0, len(cleaned_text), step):
                chunks.append({
                    "text": cleaned_text[start:start + self.chunk_size],
                    "source": page["source"],
                    "page": page["page"]
                })

        return chunks
|
rag/qdrant_retriever.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
Created on Tue Mar 3 14:40:03 2026
|
| 4 |
+
|
| 5 |
+
@author: THYAGHARAJAN
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from rag.base_retriever import BaseRetriever
|
| 9 |
+
|
| 10 |
+
class QdrantRetriever(BaseRetriever):
|
requirements.txt
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi
|
| 2 |
+
uvicorn
|
| 3 |
+
httpx
|
| 4 |
+
pydantic
|
| 5 |
+
python-multipart
|
| 6 |
+
|
| 7 |
+
sentence-transformers
|
| 8 |
+
faiss-cpu
|
| 9 |
+
|
| 10 |
+
pypdf
|
| 11 |
+
python-docx
|
| 12 |
+
pandas
|
| 13 |
+
|
| 14 |
+
pytesseract
|
| 15 |
+
opencv-python-headless
|
| 16 |
+
Pillow
|
| 17 |
+
|
| 18 |
+
passlib[bcrypt]
|
| 19 |
+
python-jose[cryptography]
|
| 20 |
+
requests
|
utils/__init__.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
Created on Wed Mar 4 12:55:04 2026
|
| 4 |
+
|
| 5 |
+
@author: THYAGHARAJAN
|
| 6 |
+
"""
|
| 7 |
+
|
utils/admin_fns.py
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
Created on Sat Apr 11 15:03:04 2026
|
| 4 |
+
|
| 5 |
+
@author: THYAGHARAJAN
|
| 6 |
+
"""
|
| 7 |
+
#line 20 import avoids circular error
|
| 8 |
+
from utils.core_imports import get_current_user
|
| 9 |
+
|
| 10 |
+
from fastapi import APIRouter, Depends, HTTPException
|
| 11 |
+
import os
|
| 12 |
+
import shutil
|
| 13 |
+
import threading #used for rebuild_faiss_index line 54
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
router = APIRouter() #the app acted as a proxy to Ollama in main file
|
| 17 |
+
|
| 18 |
+
@router.delete("/admin/delete-document")
def delete_document(filename: str,current_user: str = Depends(get_current_user)):
    """Remove a document's chunks from the database, delete its file, and rebuild FAISS.

    Args:
        filename: Source name of the document (surrounding whitespace ignored).
        current_user: Authenticated user resolved by get_current_user.

    Returns:
        Dict with a confirmation "message".

    Raises:
        HTTPException(404): no chunk is stored for this source name.
    """
    # Imported here rather than at module level to avoid a circular import.
    from utils.core_imports import get_db, rebuild_faiss_index, get_upload_folder

    filename = filename.strip()

    conn = get_db()
    try:
        cursor = conn.cursor()

        # Check existence
        cursor.execute(
            "SELECT faiss_id FROM document_chunks WHERE TRIM(source)=?",
            (filename,)
        )
        if not cursor.fetchall():
            raise HTTPException(status_code=404, detail="Document not found")

        cursor.execute(
            "DELETE FROM document_chunks WHERE TRIM(source)=?",
            (filename,)
        )
        conn.commit()
    finally:
        # Close on every path, including the 404 and unexpected errors.
        conn.close()

    # Delete the physical file. Only the base name is used so a crafted value
    # can never point outside the upload folder.
    file_path = os.path.join(get_upload_folder(), os.path.basename(filename))
    if os.path.isfile(file_path):
        os.remove(file_path)

    # Rebuild exactly once. The previous code could run a synchronous rebuild
    # and a background-thread rebuild at the same time.
    rebuild_faiss_index()

    return {"message": f"{filename} removed from index"}
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
@router.delete("/admin/delete-folder")
def delete_folder(folder: str, current_user: str = Depends(get_current_user)):
    """
    Remove every indexed chunk whose ``source`` contains ``folder``.

    Parameters
    ----------
    folder : str
        Substring to look for in ``document_chunks.source``. It is matched
        literally: ``%`` and ``_`` are escaped so they are not treated as
        LIKE wildcards.
    current_user : str
        Value injected by the ``get_current_user`` dependency; not used in
        the body.

    Returns
    -------
    dict
        ``{"message": "<folder> folder removed from index"}``.

    Raises
    ------
    HTTPException
        400 when ``folder`` is empty or whitespace only (it would otherwise
        match, and delete, every row); 404 when no row matched.
    """
    # Lazy import: importing these at module level causes a circular import.
    from utils.core_imports import get_db, rebuild_faiss_index

    if not folder.strip():
        raise HTTPException(status_code=400, detail="Folder name must not be empty")

    # Escape LIKE metacharacters so e.g. "doc_ingestion" matches only itself.
    escaped = folder.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")

    conn = get_db()
    try:
        cursor = conn.cursor()
        cursor.execute(
            "DELETE FROM document_chunks WHERE source LIKE ? ESCAPE '\\'",
            (f"%{escaped}%",)
        )
        deleted_count = cursor.rowcount
        conn.commit()
    finally:
        # Close the connection even when the query raises.
        conn.close()

    if deleted_count == 0:
        raise HTTPException(status_code=404, detail="Folder not found")

    # Rebuild FAISS in the background so the response is not delayed.
    threading.Thread(target=rebuild_faiss_index).start()

    return {"message": f"{folder} folder removed from index"}
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
@router.delete("/admin/reset-index")
def reset_index(confirm: bool = False, current_user: str = Depends(get_current_user)):
    """
    Wipe the whole index: all chunk rows and all uploaded files.

    Parameters
    ----------
    confirm : bool
        Must be ``True`` for anything to be deleted. When ``False`` the
        endpoint only returns a reminder message.
    current_user : str
        Value injected by the ``get_current_user`` dependency; not used in
        the body.

    Returns
    -------
    dict
        ``{"message": "Set confirm=true to reset index"}`` when not
        confirmed, otherwise ``{"message": "Index reset completed"}``.
    """
    # Lazy import: importing these at module level causes a circular import.
    from utils.core_imports import get_db, rebuild_faiss_index, get_upload_folder

    if not confirm:
        return {"message": "Set confirm=true to reset index"}

    conn = get_db()
    try:
        cursor = conn.cursor()
        # Delete ALL rows in the document_chunks table.
        cursor.execute("DELETE FROM document_chunks")
        conn.commit()
    finally:
        # Close the connection even when the query raises.
        conn.close()

    # Empty the upload folder by removing and recreating it. Guard the
    # removal: rmtree raises when the folder is already missing, which would
    # abort after the table was wiped and leave the FAISS index stale.
    upload_dir = get_upload_folder()
    if os.path.isdir(upload_dir):
        shutil.rmtree(upload_dir)
    os.makedirs(upload_dir, exist_ok=True)

    # Rebuild FAISS in the background so the response is not delayed.
    threading.Thread(target=rebuild_faiss_index).start()

    return {"message": "Index reset completed"}
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
@router.get("/admin/list-documents")
def list_documents(current_user: str = Depends(get_current_user)):
    """
    List every indexed document together with its chunk count.

    Returns
    -------
    dict
        ``{"documents": [{"document": <source>, "chunks": <count>}, ...]}``
        with one entry per distinct ``source`` in ``document_chunks``.
    """
    # Lazy import: importing this at module level causes a circular import.
    from utils.core_imports import get_db

    connection = get_db()
    cur = connection.cursor()
    cur.execute("""
        SELECT source, COUNT(*) as chunks
        FROM document_chunks
        GROUP BY source
    """)
    grouped = cur.fetchall()
    connection.close()

    documents = []
    for record in grouped:
        documents.append({"document": record[0], "chunks": record[1]})

    return {"documents": documents}
|
utils/core_imports.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
Created on Sun Apr 12 22:31:05 2026
|
| 4 |
+
|
| 5 |
+
@author: THYAGHARAJAN
|
| 6 |
+
"""
|
| 7 |
+
#NOTE
|
| 8 |
+
#If you change the kkt_FastAPI_server file name, then change the SERVER_MODULE name here
|
| 9 |
+
|
| 10 |
+
# =====================================
|
| 11 |
+
# 🔁 Centralized Server Module Import
|
| 12 |
+
# =====================================
|
| 13 |
+
SERVER_MODULE = "app"
|
| 14 |
+
|
| 15 |
+
import importlib
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def _get_server():
    """Return the server module whose name is stored in ``SERVER_MODULE``.

    The import happens on each call rather than at module load, so this
    module can be imported before the server module is.
    """
    server_module = importlib.import_module(SERVER_MODULE)
    return server_module
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
# =====================================
|
| 23 |
+
# 🔁 Lazy Re-export functions
|
| 24 |
+
# =====================================
|
| 25 |
+
|
| 26 |
+
def get_db():
    """Call the server module's ``get_db()`` and return its result."""
    server = _get_server()
    return server.get_db()
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def rebuild_faiss_index():
    """Call the server module's ``rebuild_faiss_index()`` and return its result."""
    server = _get_server()
    return server.rebuild_faiss_index()
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def get_current_user():
    """Return the server module's ``get_current_user`` attribute.

    The attribute is returned as-is; it is NOT called here.

    NOTE(review): utils/admin_fns.py passes this wrapper to FastAPI as
    ``Depends(get_current_user)``. FastAPI injects whatever the dependency
    callable returns, so the routes would receive the server's function
    object as ``current_user`` and the server's own check would not run for
    those requests. Confirm against the server module whether the admin
    routes are actually authenticated.
    """
    server = _get_server()
    return server.get_current_user
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def get_upload_folder():
    """Return the server module's ``UPLOAD_FOLDER`` value."""
    server = _get_server()
    return server.UPLOAD_FOLDER
|
utils/text_cleanerV1.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
Created on Wed Mar 4 12:25:04 2026
|
| 4 |
+
|
| 5 |
+
@author: THYAGHARAJAN
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import re
|
| 9 |
+
|
| 10 |
+
def clean_text(text: str) -> str:
    """
    Basic cleanup of extracted PDF text before it is used for RAG.

    Strips URLs, dd-dd-dddd dates and "Scan the QR code ..." lines, collapses
    all whitespace runs to a single space, then drops words that repeat the
    word immediately before them.
    """
    # (pattern, replacement, flags), applied strictly in this order.
    substitutions = (
        (r"http\S+", "", 0),                        # URLs
        (r"\b\d{2}-\d{2}-\d{4}\b", "", 0),          # standalone dates like 02-03-2026
        (r"Scan the QR code.*", "", re.IGNORECASE),  # QR instruction lines
        (r"\s+", " ", 0),                           # whitespace runs
    )
    for pattern, replacement, flags in substitutions:
        text = re.sub(pattern, replacement, text, flags=flags)

    # Keep a token only when it differs from the token right before it.
    tokens = text.split()
    kept = [
        tok for pos, tok in enumerate(tokens)
        if pos == 0 or tok != tokens[pos - 1]
    ]

    return " ".join(kept).strip()
|
utils/text_cleanerV2.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
Created on Wed Mar 4 12:25:04 2026
|
| 4 |
+
|
| 5 |
+
@author: THYAGHARAJAN
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import re
|
| 9 |
+
import unicodedata
|
| 10 |
+
|
| 11 |
+
def clean_text(text: str) -> str:
    """
    Main cleaning pipeline.

    Runs each cleaning step on the output of the previous one. Order matters.
    Falsy input (empty string, ``None``) yields ``""``.
    """
    if not text:
        return ""

    pipeline = (
        normalize_unicode,
        remove_non_printable,
        remove_headers_footers,
        remove_page_numbers,
        remove_extra_whitespace,
        remove_duplicate_words,
    )
    for step in pipeline:
        text = step(text)

    return text
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def normalize_unicode(text: str) -> str:
    """
    Return ``text`` in Unicode NFKC form.

    Compatibility characters (ligatures, full-width letters, non-breaking
    spaces, ...) are folded to their plain equivalents, which evens out
    artifacts of PDF extraction.
    """
    normalized = unicodedata.normalize("NFKC", text)
    return normalized
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def remove_extra_whitespace(text: str) -> str:
    """
    Tidy spacing: runs of spaces/tabs become one space, runs of blank lines
    become a single blank line, and leading/trailing whitespace is dropped.
    """
    single_spaced = re.compile(r"[ \t]+").sub(" ", text)
    # Any newline, optional whitespace, then more newlines -> exactly two newlines.
    squeezed = re.compile(r"\n\s*\n+").sub("\n\n", single_spaced)
    return squeezed.strip()
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def remove_page_numbers(text: str) -> str:
    """
    Strip page-number artifacts.

    Handles a number alone on its own line, optionally wrapped in dashes
    ('12', '- 23 -'), and any 'Page <n>' occurrence (case-insensitive).
    """
    # A line holding only a number (with optional dashes) collapses to one newline.
    without_bare_numbers = re.sub(r"\n\s*[-–]?\s*\d+\s*[-–]?\s*\n", "\n", text)
    return re.sub(r"Page\s*\d+", "", without_bare_numbers, flags=re.IGNORECASE)
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def remove_headers_footers(text: str) -> str:
    """
    Delete common boilerplate: copyright / "all rights reserved" notices
    (through the end of their line) and www/http(s) links.

    Matching is case-insensitive. Extend the pattern list to customize.
    """
    boilerplate = (
        r"Copyright\s.*",
        r"All rights reserved.*",
        r"www\.[^\s]+",
        r"http[s]?://[^\s]+",
    )

    cleaned = text
    for expression in boilerplate:
        cleaned = re.compile(expression, re.IGNORECASE).sub("", cleaned)

    return cleaned
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def remove_non_printable(text: str) -> str:
    """
    Remove non-printable characters left over from PDF extraction.

    Newline, carriage return and tab are kept even though
    ``str.isprintable()`` reports them as non-printable: dropping them would
    merge every line of the document into one, and line boundaries carry
    meaning for any line-based processing of the result.

    Parameters
    ----------
    text : str
        Raw extracted text.

    Returns
    -------
    str
        ``text`` without control / non-printable characters other than
        ``\\n``, ``\\r`` and ``\\t``.
    """
    layout_chars = ("\n", "\r", "\t")
    return "".join(
        ch for ch in text
        if ch in layout_chars or ch.isprintable()
    )
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def remove_duplicate_words(text: str) -> str:
    """
    Collapse immediate word repetitions ("the the" -> "the").

    The text is split on whitespace and re-joined with single spaces, so the
    original spacing and line breaks are not preserved.
    """
    tokens = text.split()
    # Keep a token only when it differs from the one right before it.
    deduped = [
        tok for pos, tok in enumerate(tokens)
        if pos == 0 or tokens[pos - 1] != tok
    ]
    return " ".join(deduped)
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
|