aniket47 committed on
Commit
01728c5
·
1 Parent(s): 8879e07

Deploy Document Intelligence Chatbot

Browse files

- Streamlit-based chatbot with PDF processing
- Local Hugging Face models (Flan-T5, Sentence Transformers)
- Smart query routing (documents vs web search)
- FAISS vector search for fast retrieval
- Optional web search integration

.gitattributes CHANGED
@@ -1,35 +1,4 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
  *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
  *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
 
2
  *.bin filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
3
  *.onnx filter=lfs diff=lfs merge=lfs -text
4
+ *.faiss filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.gitignore ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ share/python-wheels/
20
+ *.egg-info/
21
+ .installed.cfg
22
+ *.egg
23
+ MANIFEST
24
+
25
+ # PyInstaller
26
+ *.manifest
27
+ *.spec
28
+
29
+ # Installer logs
30
+ pip-log.txt
31
+ pip-delete-this-directory.txt
32
+
33
+ # Unit test / coverage reports
34
+ htmlcov/
35
+ .tox/
36
+ .nox/
37
+ .coverage
38
+ .coverage.*
39
+ .cache
40
+ nosetests.xml
41
+ coverage.xml
42
+ *.cover
43
+ *.py,cover
44
+ .hypothesis/
45
+ .pytest_cache/
46
+ cover/
47
+
48
+ # Virtual environments
49
+ .env
50
+ .venv
51
+ env/
52
+ venv/
53
+ ENV/
54
+ env.bak/
55
+ venv.bak/
56
+
57
+ # Jupyter Notebook
58
+ .ipynb_checkpoints
59
+
60
+ # pyenv
61
+ .python-version
62
+
63
+ # Environments
64
+ .env
65
+ .env.local
66
+ .env.development.local
67
+ .env.test.local
68
+ .env.production.local
69
+
70
+ # Streamlit
71
+ .streamlit/secrets.toml
72
+
73
+ # FAISS index files
74
+ *.faiss
75
+ *_docs.pkl
76
+ vector_index*
77
+
78
+ # Downloaded models
79
+ models/
80
+ *.safetensors
81
+ *.bin
82
+ pytorch_model.bin
83
+ config.json
84
+ tokenizer.json
85
+ tokenizer_config.json
86
+ special_tokens_map.json
87
+ vocab.txt
88
+
89
+ # Temporary files
90
+ *.tmp
91
+ *.temp
92
+ .DS_Store
93
+ Thumbs.db
94
+
95
+ # IDE
96
+ .vscode/
97
+ .idea/
98
+ *.swp
99
+ *.swo
100
+
101
+ # Logs
102
+ *.log
103
+ logs/
104
+
.streamlit/config.toml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [server]
2
+ headless = true
3
+ port = 8501
4
+ enableCORS = false
5
+ enableXsrfProtection = false
6
+
7
+ [browser]
8
+ gatherUsageStats = false
9
+
10
+ [theme]
11
+ primaryColor = "#6366f1"
12
+ backgroundColor = "#0e1117"
13
+ secondaryBackgroundColor = "#262730"
14
+ textColor = "#fafafa"
15
+ font = "sans serif"
README.md CHANGED
@@ -1,20 +1,48 @@
1
  ---
2
- title: Document Intelligence Chatbot
3
- emoji: 🚀
4
- colorFrom: red
5
- colorTo: red
6
- sdk: docker
7
- app_port: 8501
8
- tags:
9
- - streamlit
10
  pinned: false
11
- short_description: Streamlit template space
12
  license: mit
13
  ---
14
 
15
- # Welcome to Streamlit!
16
 
17
- Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart:
 
 
18
 
19
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
20
- forums](https://discuss.streamlit.io).
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Universal Document Intelligence Chatbot
3
+ emoji: 📚
4
+ colorFrom: blue
5
+ colorTo: purple
6
+ sdk: streamlit
7
+ sdk_version: 1.28.0
8
+ app_file: app.py
 
9
  pinned: false
 
10
  license: mit
11
  ---
12
 
13
+ # 📚 Universal Document Intelligence Chatbot
14
 
15
+ [![Streamlit](https://img.shields.io/badge/Streamlit-FF4B4B?style=for-the-badge&logo=streamlit&logoColor=white)](https://streamlit.io)
16
+ [![HuggingFace](https://img.shields.io/badge/🤗-Hugging%20Face-yellow?style=for-the-badge)](https://huggingface.co)
17
+ [![Python](https://img.shields.io/badge/Python-3.10-blue?style=for-the-badge&logo=python)](https://python.org)
18
 
19
+ A simple, private, and powerful chatbot that can answer your questions using both your own documents and the web.
20
+
21
+ ## ✨ Features
22
+
23
+ - **🔒 Runs locally**: Uses Hugging Face Transformers, so your data stays private
24
+ - **⚡ Quick search**: Finds answers fast with Sentence Transformers and FAISS
25
+ - **🧠 Smart routing**: Decides when to pull from your documents or from the web
26
+ - **📄 Handles PDFs**: Upload PDFs and ask questions directly
27
+ - **🌐 Stay up to date**: Can use web search for the latest information
28
+ - **📦 No setup hassle**: Downloads models automatically the first time you use them
29
+
30
+ ## 🚀 Quick Start
31
+
32
+ ### Local Setup
33
+
34
+ ```bash
35
+ # Create a virtual environment
36
+ python -m venv venv
37
+
38
+ # Activate it (Windows)
39
+ venv\Scripts\activate.bat
40
+
41
+ # Activate it (Mac/Linux)
42
+ source venv/bin/activate
43
+
44
+ # Install dependencies
45
+ pip install -r requirements.txt
46
+
47
+ # Launch the app
48
+ streamlit run app.py
+ ```
app.py ADDED
@@ -0,0 +1,435 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+ from typing import List, Dict
4
+ import time
5
+
6
+ # Import custom components
7
+ from components.document_processor import DocumentProcessor
8
+ from components.vector_store import VectorStore
9
+ from components.query_router import QueryRouter, QueryType
10
+ from components.web_search import WebSearcher
11
+ from components.huggingface_client import HuggingFaceClient
12
+
13
+ # Page configuration
14
+ st.set_page_config(
15
+ page_title="Universal Document Intelligence Chatbot",
16
+ layout="wide",
17
+ initial_sidebar_state="expanded"
18
+ )
19
+
20
@st.cache_resource
def get_hf_client():
    """Build the shared HuggingFace client and eagerly load its model.

    Decorated with ``st.cache_resource``, so the returned tuple is memoised by
    Streamlit and handed back on later calls instead of being rebuilt.

    Returns:
        tuple: ``(client, success)`` where ``success`` is whatever
        ``client._load_model()`` returned, or ``(None, False)`` when
        constructing the client or loading the model raised.
    """
    try:
        print("Initializing cached HuggingFace client...")
        hf = HuggingFaceClient()
        # Load right away rather than on first use, so the result is known here.
        loaded = hf._load_model()
        print(f"Model loading success: {loaded}")
        print(f"Model is_loaded: {hf.is_loaded}")
    except Exception as exc:
        print(f"Failed to initialize HuggingFace client: {str(exc)}")
        return None, False
    return hf, loaded
34
+
35
class DocumentChatbot:
    """Ties together PDF ingestion, vector retrieval, query routing, web search
    and answer generation for the Streamlit UI.

    Attributes set in ``__init__``:
        doc_processor: ``DocumentProcessor`` used to chunk uploaded PDFs.
        vector_store: ``VectorStore`` holding the searchable chunks.
        query_router: ``QueryRouter`` deciding between documents and web.
        web_searcher: ``WebSearcher`` or ``None`` when it could not be created.
        hf_client / model_loaded: the pair returned by ``get_hf_client()``.
    """

    # System prompts, kept verbatim per route so model inputs do not change.
    _WEB_ROUTE_PROMPT = "You are a helpful AI assistant that answers questions based on web search results. Be accurate and cite sources when appropriate."
    _WEB_FALLBACK_PROMPT = "You are a helpful AI assistant. Answer the user's question based on the provided web search results. Be informative and cite your sources."
    _DOCUMENT_PROMPT = "You are a helpful AI assistant that answers questions based on provided document context. Be accurate and cite the source documents when appropriate."
    _NOT_FOUND_MESSAGE = "I couldn't find relevant information in your documents or through web search. Please try rephrasing your question or upload more relevant documents."

    def __init__(self):
        self.doc_processor = DocumentProcessor()
        self.vector_store = VectorStore()
        self.query_router = QueryRouter()
        self.web_searcher = None

        # Get cached HuggingFace client
        self.hf_client, self.model_loaded = get_hf_client()

        # WebSearcher() raises ValueError when it cannot be configured; the app
        # then keeps running with web search disabled.
        try:
            self.web_searcher = WebSearcher()
        except ValueError as e:
            st.warning(f"Web search disabled: {str(e)}")

        # Load existing index if available
        self.vector_store.load_index()

    def is_ai_model_available(self):
        """Return True when a client exists and reports its model as loaded."""
        return self.hf_client is not None and self.hf_client.is_loaded

    def initialize_ai_model(self) -> bool:
        """(Re)try loading the AI model; used by the sidebar "Load AI Model" button.

        Returns:
            bool: True when the model is available after the attempt.
        """
        if self.hf_client is None:
            # get_hf_client() memoises its result, including a failed
            # (None, False); drop the cached value so the retry really runs.
            get_hf_client.clear()
            self.hf_client, self.model_loaded = get_hf_client()
        elif not self.hf_client.is_loaded:
            try:
                self.model_loaded = bool(self.hf_client._load_model())
            except Exception as e:
                print(f"Failed to load AI model: {str(e)}")
                self.model_loaded = False
        return self.is_ai_model_available()

    def process_uploaded_files(self, uploaded_files):
        """Chunk each uploaded PDF, index the chunks and persist the index.

        A failure on one file is reported with ``st.error`` and does not stop
        the remaining files from being processed.
        """
        if not uploaded_files:
            return

        with st.spinner("Processing uploaded documents..."):
            all_chunks = []

            for uploaded_file in uploaded_files:
                try:
                    chunks = self.doc_processor.process_document(uploaded_file)
                    all_chunks.extend(chunks)
                    st.success(f"Processed {uploaded_file.name}: {len(chunks)} chunks")
                except Exception as e:
                    st.error(f"Error processing {uploaded_file.name}: {str(e)}")

            if all_chunks:
                self.vector_store.add_documents(all_chunks)
                self.vector_store.save_index()

                st.success(f"Successfully processed {len(all_chunks)} document chunks!")

                # Update session state
                st.session_state.documents_loaded = True
                st.session_state.vector_stats = self.vector_store.get_stats()

    def search_documents(self, query: str, k: int = 5) -> List[Dict]:
        """Return up to ``k`` vector-store hits for ``query`` ([] when nothing is indexed)."""
        if self.vector_store.index is None or len(self.vector_store.documents) == 0:
            print(f"No documents available - index: {self.vector_store.index is not None}, docs: {len(self.vector_store.documents) if hasattr(self.vector_store, 'documents') else 'N/A'}")
            return []

        results = self.vector_store.search(query, k=k)
        print(f"Document search for '{query}': found {len(results)} results")
        if results:
            scores = [r.get('score', 0) for r in results]
            print(f"Score range: {min(scores):.3f} - {max(scores):.3f}")
        return results

    def get_web_search_results(self, query: str) -> List[Dict]:
        """Return formatted web results, or [] when search is disabled or fails."""
        if not self.web_searcher:
            return []

        try:
            return self.web_searcher.search_and_format(query, num_results=3)
        except Exception as e:
            st.error(f"Web search error: {str(e)}")
            return []

    def _answer_from_web(self, response: Dict, query: str, web_results: List[Dict], system_prompt: str) -> Dict:
        """Fill ``response`` with sources and an answer built from web results."""
        context = "Web search results:\n"
        for i, result in enumerate(web_results[:3], 1):
            context += f"{i}. {result['title']}: {result['snippet']}\n"
            response['sources'].append({
                'type': 'web',
                'title': result['title'],
                'snippet': result['snippet'],
                'link': result.get('link', ''),
                'source': result.get('source', '')
            })

        ai_response = ""
        if self.is_ai_model_available():
            # Treat a None reply like an empty one instead of crashing on .strip().
            ai_response = self.hf_client.generate_response(query, context, system_prompt) or ""

        if not ai_response.strip():
            response['answer'] = f"**🌐 Web Search Results:**\n{context}"
            response['ai_model_used'] = False
        elif len(ai_response.strip()) < 50 or "not sure" in ai_response.lower():
            # Short or hedging model output: lead with the raw results.
            response['answer'] = f"**🌐 Web Search Results:**\n{context}\n\n**🤖 AI Analysis:**\n{ai_response}"
            response['ai_model_used'] = True
        else:
            response['answer'] = f"**🤖 AI Analysis:**\n{ai_response}\n\n**🌐 Web Search Results:**\n{context}"
            response['ai_model_used'] = True
        return response

    def _answer_from_documents(self, response: Dict, query: str, doc_results: List[Dict]) -> Dict:
        """Fill ``response`` with sources and an answer built from the top document hits."""
        context = "Relevant information from your documents:\n"
        for i, result in enumerate(doc_results[:3], 1):
            doc = result['document']
            score = result['score']
            context += f"{i}. From {doc['metadata']['filename']} (relevance: {score:.2f}):\n{doc['text']}\n\n"
            response['sources'].append({
                'type': 'document',
                'filename': doc['metadata']['filename'],
                'text': doc['text'],
                'score': score,
                'chunk_id': doc['metadata'].get('chunk_index', 0)
            })

        ai_response = ""
        if self.is_ai_model_available():
            ai_response = self.hf_client.generate_response(query, context, self._DOCUMENT_PROMPT) or ""
            print(f"DEBUG: AI response length: {len(ai_response.strip())}")
        else:
            print("DEBUG: AI model not available, using fallback")

        if len(ai_response.strip()) > 5:
            response['answer'] = f"**🤖 AI Summary:**\n{ai_response}\n\n**📄 Source Documents:**\n{context}"
            response['ai_model_used'] = True
        else:
            # Empty or trivial model output: show the retrieved passages only.
            response['answer'] = f"**📄 Source Documents:**\n{context}"
            response['ai_model_used'] = False
        return response

    def generate_response(self, query: str) -> Dict:
        """Answer ``query`` from documents or the web, as chosen by the router.

        Routing:
          * router suggests web search -> use web results; if there are none,
            fall back to the document hits (when any) instead of searching
            the web a second time;
          * router suggests documents and hits exist -> use the documents;
          * anything else -> web search, then a "not found" message.

        Returns:
            Dict with keys ``query``, ``sources``, ``answer``, ``routing_info``,
            ``search_strategy`` and ``ai_model_used``.
        """
        response = {
            'query': query,
            'sources': [],
            'answer': '',
            'routing_info': '',
            'search_strategy': 'unknown',
            'ai_model_used': False
        }

        doc_results = self.search_documents(query)
        routing_analysis = self.query_router.analyze_query_semantic(query, self.vector_store, similarity_threshold=0.15)
        print(f"DEBUG: Semantic routing result: {routing_analysis}")

        route = routing_analysis['suggested_route']
        similarity = routing_analysis.get('similarity_score', 0)

        if route == QueryType.WEB_SEARCH:
            print(f"DEBUG: Using web search for query: '{query}' (similarity: {similarity:.3f})")
            web_results = self.get_web_search_results(query)
            print(f"DEBUG: Web search returned {len(web_results) if web_results else 0} results")

            if web_results:
                response['search_strategy'] = 'web_search'
                response['routing_info'] = f"Strategy: web_search (reason: {routing_analysis['reasoning'][0] if routing_analysis['reasoning'] else 'semantic analysis'})"
                return self._answer_from_web(response, query, web_results, self._WEB_ROUTE_PROMPT)

            if doc_results:
                print("DEBUG: No web results, falling back to document search")
                response['search_strategy'] = 'document_search'
                response['routing_info'] = f"Strategy: document_search (web search returned no results; using {len(doc_results)} document matches)"
                return self._answer_from_documents(response, query, doc_results)

            # Web search was already tried; do not repeat the identical call.
            response['search_strategy'] = 'web_search'
            response['routing_info'] = f"Strategy: web_search (no relevant documents found or documents not relevant enough)"
            response['answer'] = self._NOT_FOUND_MESSAGE
            return response

        if route == QueryType.DOCUMENT_ONLY and doc_results:
            best_score = max(r.get('score', 0) for r in doc_results)
            print(f"DEBUG: Using documents based on semantic routing: {len(doc_results)} results, best score: {best_score:.3f}")
            response['search_strategy'] = 'document_search'
            response['routing_info'] = f"Strategy: document_search (semantic similarity: {similarity:.3f}, found {len(doc_results)} matches)"
            return self._answer_from_documents(response, query, doc_results)

        # Fallback: use web search if no relevant documents were found.
        print("DEBUG: Using web search fallback")
        response['search_strategy'] = 'web_search'
        response['routing_info'] = f"Strategy: web_search (no relevant documents found or documents not relevant enough)"
        web_results = self.get_web_search_results(query)

        if web_results:
            return self._answer_from_web(response, query, web_results, self._WEB_FALLBACK_PROMPT)

        response['answer'] = self._NOT_FOUND_MESSAGE
        return response
262
+
263
def _render_response_details(entry, show_strategy_caption=False):
    """Render the "Search Strategy" and "Sources" expanders for one chat entry.

    Args:
        entry: dict with optional ``routing_info``, ``sources`` and
            ``search_strategy`` keys (a chat-history item or a fresh response).
        show_strategy_caption: also print the strategy name under the routing
            info; the live reply does this, replayed history does not.
    """
    if entry.get('routing_info'):
        with st.expander("Search Strategy"):
            st.info(entry['routing_info'])
            if show_strategy_caption:
                st.caption(f"Strategy used: {entry['search_strategy']}")

    if entry.get('sources'):
        with st.expander(f"Sources ({len(entry['sources'])} found)"):
            for j, source in enumerate(entry['sources'], 1):
                if source['type'] == 'document':
                    st.markdown(f"**{j}. Document Source:**")
                    st.markdown(f"- **File:** {source['filename']}")
                    st.markdown(f"- **Relevance:** {source['score']:.2f}")
                    st.markdown(f"- **Text:** {source['text'][:200]}...")
                elif source['type'] == 'web':
                    st.markdown(f"**{j}. Web Source:**")
                    st.markdown(f"- **Title:** {source['title']}")
                    st.markdown(f"- **Source:** {source.get('source', 'Unknown')}")
                    if source.get('link'):
                        st.markdown(f"- **Link:** {source['link']}")


def main():
    """Streamlit entry point: sidebar for documents/status, main pane for chat."""

    # Initialize session state
    if 'chatbot' not in st.session_state:
        st.session_state.chatbot = DocumentChatbot()

    if 'chat_history' not in st.session_state:
        st.session_state.chat_history = []

    chatbot = st.session_state.chatbot

    if 'documents_loaded' not in st.session_state:
        # DocumentChatbot.__init__ calls vector_store.load_index(), so documents
        # may already be present; reflect that so the stats and the
        # "Clear All Documents" button are reachable for a restored index.
        restored_docs = getattr(chatbot.vector_store, 'documents', None)
        st.session_state.documents_loaded = restored_docs is not None and len(restored_docs) > 0

    # Header
    st.title("Universal Document Intelligence Chatbot")
    st.markdown("*Upload documents and ask questions - get answers from your files or the web*")

    # Sidebar for document management
    with st.sidebar:
        st.header("Document Management")

        uploaded_files = st.file_uploader(
            "Upload PDF documents",
            type=['pdf'],
            accept_multiple_files=True,
            help="Upload PDF files to create a knowledge base"
        )

        # Processing is explicit (button) so reruns do not re-index the files.
        if uploaded_files:
            if st.button("Process Documents", type="primary"):
                chatbot.process_uploaded_files(uploaded_files)

        # Display statistics
        if st.session_state.documents_loaded:
            st.subheader("Knowledge Base Stats")
            stats = chatbot.vector_store.get_stats()
            st.metric("Documents", stats['total_documents'])
            st.metric("Vector Dimension", stats['dimension'])
            st.info(f"Model: {stats['model_name']}")

        # Clear documents
        if st.session_state.documents_loaded:
            if st.button("Clear All Documents", type="secondary"):
                chatbot.vector_store.clear_index()
                st.session_state.documents_loaded = False
                st.session_state.chat_history = []
                st.success("Documents cleared!")
                st.rerun()

        # AI model status
        st.subheader("AI Model Status")
        if chatbot.is_ai_model_available():
            st.success("✅ AI model loaded")
        else:
            st.error("❌ AI model not loaded")
            if st.button("🔄 Load AI Model", type="primary"):
                # get_hf_client() is memoised, including a failed attempt;
                # clear the cache so pressing the button actually retries.
                get_hf_client.clear()
                chatbot.hf_client, chatbot.model_loaded = get_hf_client()
                if chatbot.is_ai_model_available():
                    st.rerun()

        # Web search status
        st.subheader("Web Search")
        if chatbot.web_searcher:
            st.success("Web search enabled")
        else:
            st.error("Web search disabled")
            st.info("Add SERPER_API_KEY to .env file to enable web search")

    # Main chat interface
    st.header("Chat Interface")

    # Replay the stored conversation on every rerun.
    for chat in st.session_state.chat_history:
        with st.chat_message("user"):
            st.write(chat['query'])

        with st.chat_message("assistant"):
            st.write(chat['answer'])
            _render_response_details(chat)

    # Query input
    query = st.chat_input("Ask a question about your documents or anything else...")

    if query:
        with st.chat_message("user"):
            st.write(query)

        with st.chat_message("assistant"):
            with st.spinner("Thinking..."):
                response = chatbot.generate_response(query)

            st.write(response['answer'])
            _render_response_details(response, show_strategy_caption=True)

        # Add to chat history
        st.session_state.chat_history.append({
            'query': query,
            'answer': response['answer'],
            'routing_info': response.get('routing_info'),
            'sources': response.get('sources', []),
            'search_strategy': response.get('search_strategy')
        })

    # Instructions
    if not st.session_state.chat_history:
        st.markdown("""
    ### Getting Started:

    1. **Upload PDFs** - Use the sidebar to add your documents
    2. **Click Process** - This creates a searchable knowledge base
    3. **Start Chatting** - Ask questions in the box below

    ### What you can ask:

    **About your documents:**
    - "What does the report say about..."
    - "Summarize the main points"
    - "Find information about X"

    **General questions:**
    - "What's the latest news on..."
    - "How does X work?"
    - "Compare A and B"

    The chatbot automatically decides whether to search your documents or the web.
    """)
433
+
434
+ if __name__ == "__main__":
435
+ main()
components/document_processor.py ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import PyPDF2
2
+ import re
3
+ from typing import List, Dict
4
+ import io
5
+
6
class DocumentProcessor:
    """
    Handles PDF document processing and sentence-aware text chunking.

    Args:
        chunk_size: Target maximum number of characters per chunk. A chunk can
            exceed it when a single sentence is longer than the limit.
        chunk_overlap: Maximum number of characters of trailing sentences
            repeated at the start of the next chunk; 0 disables overlap.
    """

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def extract_text_from_pdf(self, pdf_file) -> Dict[str, object]:
        """
        Extract text from a PDF file and collect its metadata.

        Args:
            pdf_file: Uploaded PDF file object; must expose ``.name`` and be
                readable by ``PyPDF2.PdfReader``.

        Returns:
            Dict with ``full_text`` (all non-empty pages, each prefixed by a
            ``[Page N]`` marker), ``pages`` (list of page_number/text dicts)
            and ``metadata`` (filename, num_pages, title, author, subject).

        Raises:
            Exception: when the PDF cannot be opened or its metadata read;
                the original error is attached as ``__cause__``.
        """
        try:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            info = pdf_reader.metadata

            metadata = {
                'filename': pdf_file.name,
                'num_pages': len(pdf_reader.pages),
                'title': info.get('/Title', '') if info else '',
                'author': info.get('/Author', '') if info else '',
                'subject': info.get('/Subject', '') if info else ''
            }

            pages_text = []
            full_text = ""

            for page_num, page in enumerate(pdf_reader.pages):
                try:
                    # Guard against a None result so an empty page is skipped
                    # quietly instead of being logged as an extraction error.
                    page_text = (page.extract_text() or "").strip()
                except Exception as e:
                    # One unreadable page should not discard the whole document.
                    print(f"Error extracting text from page {page_num + 1}: {str(e)}")
                    continue

                if page_text:  # Only add non-empty pages
                    pages_text.append({
                        'page_number': page_num + 1,
                        'text': page_text
                    })
                    full_text += f"\n\n[Page {page_num + 1}]\n{page_text}"

            return {
                'full_text': full_text.strip(),
                'pages': pages_text,
                'metadata': metadata
            }

        except Exception as e:
            raise Exception(f"Error processing PDF: {str(e)}") from e

    def clean_text(self, text: str) -> str:
        """
        Clean and normalize extracted text.

        Args:
            text: Raw extracted text

        Returns:
            Text with whitespace runs collapsed to single spaces, characters
            outside word characters/basic punctuation removed, and a space
            inserted at lowercase-to-uppercase boundaries.
        """
        # Collapse every whitespace run (including newlines) to one space
        text = re.sub(r'\s+', ' ', text)

        # Remove special characters but keep punctuation
        text = re.sub(r'[^\w\s\.\,\;\:\!\?\-\(\)\[\]\"\'\/]', '', text)

        # Fix common PDF extraction issues
        text = text.replace('\ufffd', '')  # Remove replacement characters
        # Words fused by extraction ("endStart") get split; note this also
        # splits genuine camelCase tokens.
        text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)

        return text.strip()

    def _overlap_tail(self, sentences: List[str]) -> List[str]:
        """Return the trailing sentences that fit within ``chunk_overlap`` characters."""
        tail: List[str] = []
        budget = self.chunk_overlap
        # Never carry over the whole chunk: skip its first sentence so each new
        # chunk makes progress even if chunk_overlap >= chunk_size.
        for sentence in reversed(sentences[1:]):
            cost = len(sentence) + (1 if tail else 0)  # +1 for the joining space
            if cost > budget:
                break
            tail.insert(0, sentence)
            budget -= cost
        return tail

    @staticmethod
    def _make_chunk(sentences: List[str], chunk_id: int, metadata: Dict) -> Dict:
        """Assemble one chunk record from its sentences."""
        text = " ".join(sentences).strip()
        return {
            'chunk_id': chunk_id,
            'text': text,
            'metadata': {
                **metadata,
                'chunk_size': len(text),
                'chunk_index': chunk_id
            }
        }

    def chunk_text(self, text: str, metadata: Dict) -> List[Dict]:
        """
        Split text into overlapping, sentence-aligned chunks for retrieval.

        Args:
            text: Full document text
            metadata: Document metadata, copied into every chunk's metadata

        Returns:
            List of dicts with ``chunk_id``, ``text`` and ``metadata`` (the
            document metadata plus ``chunk_size`` and ``chunk_index``).
        """
        cleaned_text = self.clean_text(text)

        # Split after sentence-ending punctuation so chunks end on sentence boundaries
        sentences = re.split(r'(?<=[.!?])\s+', cleaned_text)

        chunks: List[Dict] = []
        current: List[str] = []
        current_length = 0  # length of " ".join(current)

        for sentence in sentences:
            if not sentence:
                continue

            projected = current_length + len(sentence) + (1 if current else 0)

            # If adding this sentence would exceed chunk size, flush the current chunk
            if current and projected > self.chunk_size:
                chunks.append(self._make_chunk(current, len(chunks), metadata))

                # Seed the next chunk with whole trailing sentences; joining
                # sentences (not '. '-split fragments) avoids doubled periods.
                current = self._overlap_tail(current) if self.chunk_overlap > 0 else []
                current_length = sum(len(s) for s in current) + max(len(current) - 1, 0)
                projected = current_length + len(sentence) + (1 if current else 0)

            current.append(sentence)
            current_length = projected

        # Add the last chunk if it exists
        if current:
            chunks.append(self._make_chunk(current, len(chunks), metadata))

        return chunks

    def process_document(self, pdf_file) -> List[Dict]:
        """
        Complete document processing pipeline: extract, clean, chunk.

        Args:
            pdf_file: Uploaded PDF file

        Returns:
            List of processed text chunks with metadata
        """
        doc_data = self.extract_text_from_pdf(pdf_file)
        return self.chunk_text(doc_data['full_text'], doc_data['metadata'])
components/huggingface_client.py ADDED
@@ -0,0 +1,486 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Local Hugging Face model integration with automatic model downloading
3
+ """
4
+
5
+ import os
6
+ import torch
7
+ from typing import List, Dict, Optional
8
+ import config
9
+ import warnings
10
+
11
+ # Suppress some warnings for cleaner output
12
+ warnings.filterwarnings("ignore", category=UserWarning, module="transformers")
13
+
14
+ class HuggingFaceClient:
15
+ """
16
+ Client for local Hugging Face models with automatic downloading
17
+ """
18
+
19
def __init__(self, model_name: str = None, cache_dir: str = None):
    """Configure the client without loading any model weights.

    Args:
        model_name: Model identifier; falls back to ``config.CHAT_MODEL``
            when empty or None.
        cache_dir: Directory for downloaded model files; falls back to
            ``config.MODEL_CACHE_DIR`` when empty or None.

    Side effects:
        Creates the cache directory if missing and prints a short summary.
    """
    # Generation settings are copied from the project configuration.
    self.max_length = config.MODEL_MAX_LENGTH
    self.temperature = config.TEMPERATURE
    self.model_name = model_name if model_name else config.CHAT_MODEL
    self.cache_dir = cache_dir if cache_dir else config.MODEL_CACHE_DIR

    # The cache directory must exist before anything is downloaded into it.
    os.makedirs(self.cache_dir, exist_ok=True)

    self.device = self._setup_device()

    # Tokenizer and model are populated lazily by _load_model().
    self.tokenizer = None
    self.model = None
    self.model_type = None
    self.is_loaded = False

    summary = [
        "HuggingFace Client initialized",
        f"Model: {self.model_name}",
        f"Cache: {self.cache_dir}",
        f"Device: {self.device}",
    ]
    for entry in summary:
        print(entry)
41
+
42
+ def _setup_device(self):
43
+ """Setup computation device (CPU/GPU)"""
44
+ if config.DEVICE == "auto":
45
+ if config.USE_CUDA and torch.cuda.is_available():
46
+ device = "cuda"
47
+ print(f"Using GPU: {torch.cuda.get_device_name()}")
48
+ else:
49
+ device = "cpu"
50
+ print("Using CPU")
51
+ else:
52
+ device = config.DEVICE
53
+
54
+ return device
55
+
56
+ def _load_model(self):
57
+ """Load the model and tokenizer (downloads automatically if not cached)"""
58
+ if self.is_loaded:
59
+ return True
60
+
61
+ try:
62
+ print(f"Loading model: {self.model_name}")
63
+ print("This might take a few minutes on first run (downloading model)...")
64
+
65
+ # Import here to avoid slow startup if not needed
66
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM
67
+
68
+ # Load tokenizer
69
+ self.tokenizer = AutoTokenizer.from_pretrained(
70
+ self.model_name,
71
+ cache_dir=self.cache_dir
72
+ )
73
+
74
+ # Determine model type and load accordingly
75
+ is_t5_model = "t5" in self.model_name.lower() or "flan" in self.model_name.lower()
76
+
77
+ if is_t5_model:
78
+ print("Loading T5/FLAN model for text-to-text generation...")
79
+ self.model = AutoModelForSeq2SeqLM.from_pretrained(
80
+ self.model_name,
81
+ cache_dir=self.cache_dir,
82
+ torch_dtype=torch.float32, # T5 works better with float32
83
+ low_cpu_mem_usage=True,
84
+ trust_remote_code=True
85
+ )
86
+ self.model_type = "seq2seq"
87
+ print("T5/FLAN model loaded successfully!")
88
+ else:
89
+ print("Loading causal language model...")
90
+ self.model = AutoModelForCausalLM.from_pretrained(
91
+ self.model_name,
92
+ cache_dir=self.cache_dir,
93
+ torch_dtype=torch.float32,
94
+ low_cpu_mem_usage=True,
95
+ trust_remote_code=True
96
+ )
97
+ self.model_type = "causal"
98
+
99
+ # Add pad token for causal models
100
+ if self.tokenizer.pad_token is None:
101
+ self.tokenizer.pad_token = self.tokenizer.eos_token
102
+ print("Causal model loaded successfully!")
103
+
104
+ self.model.eval() # Set to evaluation mode
105
+ self.is_loaded = True
106
+
107
+ print(f"Model size: ~{self._get_model_size_mb():.1f} MB")
108
+ return True
109
+
110
+ except Exception as e:
111
+ print(f"Error loading model: {str(e)}")
112
+ print("Model will run in offline mode - document search will still work!")
113
+ self.is_loaded = False
114
+ return False
115
+
116
+ def _get_model_size_mb(self):
117
+ """Estimate model size in MB"""
118
+ if self.model is None:
119
+ return 0
120
+
121
+ param_size = 0
122
+ for param in self.model.parameters():
123
+ param_size += param.nelement() * param.element_size()
124
+
125
+ return param_size / 1024 / 1024
126
+
127
def generate_response(self, query: str, context: str = "", system_prompt: str = "") -> str:
    """Generate an answer for ``query``, optionally grounded in ``context``.

    The model is loaded on first use. When loading or generation fails the
    method returns the structured offline response instead of raising.

    Args:
        query: The user's question.
        context: Retrieved text to ground the answer in; truncated to 800
            characters for seq2seq models and 500 for causal models.
        system_prompt: Accepted for interface compatibility; not used by
            this implementation.

    Returns:
        The cleaned model output, the raw output when cleaning removed
        everything, or a fallback response when nothing usable was produced.
    """
    # Load model on first use; without a model only the offline path works.
    if not self.is_loaded:
        if not self._load_model():
            return self._generate_offline_response(query, context)

    try:
        summary_words = ['summarize', 'summary', 'main points', 'key points', 'overview']
        wants_summary = any(word in query.lower() for word in summary_words)

        if hasattr(self, 'model_type') and self.model_type == "seq2seq":
            # Instruction-style prompts for T5/FLAN.
            if context:
                context_truncated = context[:800]
                if wants_summary:
                    input_text = f"Summarize the following text: {context_truncated}"
                else:
                    input_text = f"Answer the question based on the context.\nContext: {context_truncated}\nQuestion: {query}\nAnswer:"
            else:
                input_text = f"Answer this question: {query}"

            input_ids = self.tokenizer.encode(input_text, return_tensors="pt", truncation=True, max_length=512)

            # Inputs must live on the same device as the model weights.
            if hasattr(self.model, 'device'):
                input_ids = input_ids.to(next(self.model.parameters()).device)
            else:
                input_ids = input_ids.to(self.device)

            with torch.no_grad():
                outputs = self.model.generate(
                    input_ids,
                    max_length=200,
                    min_length=20,
                    temperature=0.7,
                    do_sample=True,
                    pad_token_id=self.tokenizer.pad_token_id,
                    eos_token_id=self.tokenizer.eos_token_id,
                    num_return_sequences=1,
                    no_repeat_ngram_size=3,
                    length_penalty=1.0
                )

            # Seq2seq output holds only generated text, so decode all of it.
            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

        else:
            # Completion-style prompts for causal models.
            if context:
                context_truncated = context[:500]
                if wants_summary:
                    input_text = f"Summarize this: {context_truncated}\nSummary:"
                else:
                    input_text = f"Context: {context_truncated}\nQuestion: {query}\nAnswer:"
            else:
                input_text = f"Question: {query}\nAnswer:"

            input_ids = self.tokenizer.encode(input_text, return_tensors="pt", truncation=True, max_length=300)

            if hasattr(self.model, 'device'):
                input_ids = input_ids.to(next(self.model.parameters()).device)
            else:
                input_ids = input_ids.to(self.device)

            with torch.no_grad():
                outputs = self.model.generate(
                    input_ids,
                    max_length=input_ids.shape[1] + 100,
                    min_length=input_ids.shape[1] + 5,
                    temperature=0.8,
                    do_sample=True,
                    pad_token_id=self.tokenizer.eos_token_id,
                    eos_token_id=self.tokenizer.eos_token_id,
                    num_return_sequences=1,
                    no_repeat_ngram_size=2,
                    repetition_penalty=1.1,
                    length_penalty=1.0
                )

            # A causal model echoes the prompt tokens before the new ones.
            # Slicing by token count is exact, whereas comparing decoded text
            # against the prompt string fails whenever detokenisation changes
            # the text or the prompt was truncated during encoding.
            generated_ids = outputs[0][input_ids.shape[1]:]
            response = self.tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

            print(f"Extracted response: '{response[:100]}...'")

        cleaned_response = self._clean_response(response)

        # Debug logging
        print(f"Raw AI response length: {len(response)}")
        print(f"Cleaned AI response length: {len(cleaned_response)}")
        print(f"Cleaned response: '{cleaned_response[:100]}...'")

        # Prefer the cleaned text, then the raw text, then a simpler retry.
        if cleaned_response and len(cleaned_response.strip()) > 0:
            return cleaned_response
        if response and len(response.strip()) > 0:
            return response.strip()

        print("Attempting fallback generation with simpler prompt...")
        return self._try_simple_generation(query, context)

    except Exception as e:
        print(f"Error generating response: {str(e)}")
        return self._generate_offline_response(query, context)
257
+
258
+ def _try_simple_generation(self, query: str, context: str = "") -> str:
259
+ """Try a very simple generation as last resort"""
260
+ try:
261
+ # Ultra-simple prompt
262
+ simple_prompt = f"{query}"
263
+ input_ids = self.tokenizer.encode(simple_prompt, return_tensors="pt", max_length=50)
264
+
265
+ # Ensure input_ids are on the same device as the model
266
+ if hasattr(self.model, 'device'):
267
+ model_device = next(self.model.parameters()).device
268
+ input_ids = input_ids.to(model_device)
269
+ else:
270
+ input_ids = input_ids.to(self.device)
271
+
272
+ with torch.no_grad():
273
+ outputs = self.model.generate(
274
+ input_ids,
275
+ max_length=input_ids.shape[1] + 30,
276
+ temperature=0.9,
277
+ do_sample=True,
278
+ pad_token_id=self.tokenizer.eos_token_id,
279
+ num_return_sequences=1
280
+ )
281
+
282
+ response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
283
+ response = response[len(simple_prompt):].strip()
284
+
285
+ if response and len(response) > 2:
286
+ return f"AI Response: {response}"
287
+
288
+ except Exception as e:
289
+ print(f"Simple generation also failed: {e}")
290
+
291
+ return self._generate_offline_response(query, context)
292
+
293
+ def _generate_offline_response(self, query: str, context: str = "") -> str:
294
+ """Generate a structured response when AI model is unavailable or gives poor response"""
295
+ # Check if this is being called because model is unavailable or just poor response
296
+ model_available = self.is_loaded
297
+ note_suffix = "*Note: AI model generated poor response - showing raw content*" if model_available else "*Note: AI model unavailable - showing raw content*"
298
+
299
+ if context:
300
+ if "Relevant information from your documents:" in context:
301
+ # Extract and format document content
302
+ lines = context.split('\n')
303
+ document_info = []
304
+ current_info = ""
305
+
306
+ for line in lines:
307
+ line = line.strip()
308
+ if line.startswith("From ") and "relevance:" in line:
309
+ if current_info:
310
+ document_info.append(current_info)
311
+ # Extract filename
312
+ filename = line.split("(relevance:")[0].replace("From ", "").strip()
313
+ current_info = f"**From {filename}:**"
314
+ elif line and not line.startswith("Relevant information") and len(line) > 10:
315
+ current_info += f"\n{line}"
316
+
317
+ if current_info:
318
+ document_info.append(current_info)
319
+
320
+ if document_info:
321
+ response = "Based on your uploaded documents:\n\n"
322
+ for info in document_info[:2]: # Show top 2 sources
323
+ response += f"{info}\n\n"
324
+ response += f"\n{note_suffix}"
325
+ return response
326
+
327
+ elif "Web search results:" in context:
328
+ # Format web search results
329
+ lines = context.split('\n')
330
+ search_results = []
331
+
332
+ for line in lines:
333
+ if line.strip() and not line.startswith('Web search results:'):
334
+ search_results.append(line.strip())
335
+
336
+ if search_results:
337
+ response = "Based on web search results:\n\n"
338
+ for i, result in enumerate(search_results[:3], 1):
339
+ response += f"{i}. {result}\n"
340
+ response += f"\n{note_suffix}"
341
+ return response
342
+
343
+ # No context or fallback case
344
+ if model_available:
345
+ return (f"I received your question: '{query}'\n\n"
346
+ f"I'm having trouble generating a good response right now. "
347
+ f"This might be due to the complexity of the question or model limitations.\n\n"
348
+ f"Try:\n"
349
+ f"• Rephrasing your question more simply\n"
350
+ f"• Being more specific about what you want to know\n"
351
+ f"• Uploading relevant documents for better context")
352
+ else:
353
+ return (f"I received your question: '{query}'\n\n"
354
+ f"Unfortunately, I cannot provide a detailed answer because:\n"
355
+ f"• The AI model failed to load (likely network connectivity issue)\n"
356
+ f"• This appears to be a connection problem with huggingface.co\n\n"
357
+ f"To resolve this:\n"
358
+ f"• Check your internet connection\n"
359
+ f"• Try again in a few minutes\n"
360
+ f"• Consider using a VPN if there are regional restrictions\n\n"
361
+ f"The app can still search your documents - try uploading PDFs and asking questions about them!")
362
+
363
+ def _clean_response(self, response: str) -> str:
364
+ """Clean up the generated response"""
365
+ # Remove common artifacts
366
+ response = response.strip()
367
+
368
+ # Stop at certain tokens that indicate end of response
369
+ stop_tokens = ["\nUser:", "\nBot:", "Question:", "Context:", "Answer:", "<|endoftext|>"]
370
+ for token in stop_tokens:
371
+ if token in response:
372
+ response = response.split(token)[0]
373
+
374
+ # Remove repetitive patterns (but be more lenient)
375
+ lines = response.split('\n')
376
+ if len(lines) > 1:
377
+ unique_lines = []
378
+ for line in lines:
379
+ line = line.strip()
380
+ if line and line not in unique_lines:
381
+ unique_lines.append(line)
382
+ response = ' '.join(unique_lines)
383
+
384
+ # Only remove if response is very short (reduced threshold)
385
+ if len(response.strip()) < 3:
386
+ return ""
387
+
388
+ return response.strip()
389
+
390
def is_available(self) -> bool:
    """Report whether the model can be used, loading it if necessary.

    Returns False instead of raising when the check itself fails.
    """
    try:
        if self.is_loaded:
            return self.is_loaded
        # Not loaded yet: the outcome of the load attempt is the answer.
        return self._load_model()
    except Exception as e:
        print(f"Error checking model availability: {str(e)}")
        return False
400
+
401
def get_model_info(self) -> Dict:
    """Return a summary dict describing the client and its model.

    ``size_mb`` is 0 until the model has been loaded.
    """
    info = dict(
        model_name=self.model_name,
        device=self.device,
        is_loaded=self.is_loaded,
        cache_dir=self.cache_dir,
    )
    info["size_mb"] = self._get_model_size_mb() if self.is_loaded else 0
    return info
410
+
411
+
412
+ class HuggingFaceEmbeddingModel:
413
+ """
414
+ Embedding model using Sentence Transformers with automatic downloading
415
+ """
416
+
417
def __init__(self, model_name: str = None, cache_dir: str = None):
    """Configure the embedding wrapper; the model itself is loaded lazily.

    Args:
        model_name: Embedding model identifier; falls back to
            ``config.EMBEDDING_MODEL`` when empty or None.
        cache_dir: Download directory; falls back to
            ``config.MODEL_CACHE_DIR`` when empty or None.
    """
    self.model_name = model_name if model_name else config.EMBEDDING_MODEL
    self.cache_dir = cache_dir if cache_dir else config.MODEL_CACHE_DIR
    # Filled in by _load_model() on first use.
    self.model = None
    self.device = self._setup_device()

    # The cache directory must exist before the first download.
    os.makedirs(self.cache_dir, exist_ok=True)

    print(f"Embedding model: {self.model_name}")
427
+
428
+ def _setup_device(self):
429
+ """Setup computation device"""
430
+ if config.USE_CUDA and torch.cuda.is_available():
431
+ return "cuda"
432
+ return "cpu"
433
+
434
+ def _load_model(self):
435
+ """Load the sentence transformer model"""
436
+ if self.model is not None:
437
+ return
438
+
439
+ try:
440
+ print(f"Loading embedding model: {self.model_name}")
441
+ from sentence_transformers import SentenceTransformer
442
+
443
+ # Load with explicit device=None to let the library handle device assignment
444
+ self.model = SentenceTransformer(
445
+ self.model_name,
446
+ cache_folder=self.cache_dir,
447
+ device=None, # Let the library choose the best device
448
+ trust_remote_code=True
449
+ )
450
+
451
+ print(f"Embedding model loaded successfully!")
452
+ except Exception as e:
453
+ print(f"Error loading embedding model: {str(e)}")
454
+ raise e
455
+
456
def encode(self, texts: List[str]) -> "np.ndarray":
    """Encode ``texts`` into a 2-D float array with one row per text.

    The model is loaded on first use; a load failure propagates to the
    caller. When encoding itself fails, zero vectors are returned so the
    caller keeps working.

    Args:
        texts: The strings to embed.

    Returns:
        A NumPy array of shape ``(len(texts), dimension)``.
    """
    if self.model is None:
        self._load_model()

    try:
        embeddings = self.model.encode(texts, convert_to_tensor=True)
        return embeddings.cpu().numpy()
    except Exception as e:
        print(f"Error encoding texts: {str(e)}")
        import numpy as np

        # The previous fallback returned random vectors of width 384. Random
        # vectors look like real embeddings and match arbitrary queries, and
        # the fixed width is wrong for any model of a different size. Zero
        # vectors are deterministic and carry no spurious similarity.
        try:
            dimension = int(self.model.get_sentence_embedding_dimension())
        except Exception:
            # Width unknown: keep the historical default of 384.
            dimension = 384
        return np.zeros((len(texts), dimension), dtype='float32')
469
+
470
def get_dimension(self) -> int:
    """Return the embedding width, measured by encoding one sample text."""
    model_missing = self.model is None
    if model_missing:
        self._load_model()

    # The second axis of the encoded batch is the embedding width.
    probe = self.encode(["sample text"])
    _, width = probe.shape[0], probe.shape[1]
    return width
478
+
479
def is_available(self) -> bool:
    """Report whether the embedding model can be used, loading it if needed.

    Returns:
        True when a model is loaded after the check; False when loading
        raised an ordinary exception.
    """
    try:
        if self.model is None:
            self._load_model()
        return self.model is not None
    except Exception as e:
        # Only ordinary errors mean "unavailable". The previous bare except
        # also swallowed KeyboardInterrupt and SystemExit.
        print(f"Error checking embedding model availability: {str(e)}")
        return False
components/query_router.py ADDED
@@ -0,0 +1,304 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from typing import Dict, List, Tuple, Optional
3
+ from enum import Enum
4
+
5
class QueryType(Enum):
    """Routing strategies that QueryRouter can suggest for a query."""
    # Answer from the indexed documents.
    DOCUMENT_ONLY = "document_only"
    # Answer from web search.
    WEB_SEARCH = "web_search"
    # Mixed signals: the caller decides based on document result quality.
    HYBRID = "hybrid"
9
+
10
+ class QueryRouter:
11
+ """
12
+ Smart query routing logic to determine whether to use document search,
13
+ web search, or both based on query characteristics
14
+ """
15
+
16
+ def __init__(self):
17
+ # Keywords that trigger web search
18
+ self.web_search_keywords = {
19
+ 'temporal': [
20
+ 'latest', 'recent', 'current', 'now', 'today', 'this year',
21
+ '2024', '2025', 'new', 'updated', 'modern', 'contemporary'
22
+ ],
23
+ 'explanatory': [
24
+ 'explain', 'how does', 'how to', 'what is', 'what are',
25
+ 'why does', 'why is', 'tell me about', 'describe'
26
+ ],
27
+ 'comparative': [
28
+ 'vs', 'versus', 'compare', 'comparison', 'difference between',
29
+ 'alternatives to', 'better than', 'similar to', 'like'
30
+ ],
31
+ 'current_data': [
32
+ 'price', 'cost', 'stock', 'trend', 'trending', 'popular',
33
+ 'market', 'value', 'rate', 'statistics', 'data'
34
+ ],
35
+ 'specifications': [
36
+ 'specs', 'specifications', 'features', 'details', 'technical',
37
+ 'performance', 'benchmark', 'review'
38
+ ],
39
+ 'superlatives': [
40
+ 'slowest', 'biggest', 'smallest', 'best', 'worst',
41
+ 'most', 'least', 'highest', 'lowest', 'top', 'bottom',
42
+ 'largest', 'tallest', 'strongest', 'weakest'
43
+ ],
44
+ 'factual_queries': [
45
+ 'world record', 'world', 'global', 'worldwide', 'international',
46
+ 'country', 'countries', 'nation', 'capital', 'population'
47
+ ]
48
+ }
49
+
50
+ # Keywords that strongly suggest document search
51
+ self.document_keywords = [
52
+ 'according to', 'in the document', 'from the file', 'mentioned',
53
+ 'stated', 'written', 'document says', 'file contains',
54
+ 'pdf', 'pdf about', 'this pdf', 'document about', 'file about',
55
+ 'resume', 'cv', 'uploaded', 'this document', 'this file'
56
+ ]
57
+
58
+ # General knowledge keywords that might need web search
59
+ self.general_knowledge_keywords = [
60
+ 'definition', 'meaning', 'concept', 'theory', 'principle',
61
+ 'history', 'background', 'overview', 'introduction'
62
+ ]
63
+
64
def analyze_query(self, query: str) -> Dict:
    """Score a query against the keyword tables and suggest a route.

    Keywords are matched as whole words or phrases (an optional plural
    "s"/"es" is tolerated), so "now" no longer fires on "know" and "top"
    no longer fires on "laptop".

    Args:
        query: User query string.

    Returns:
        Dict with keys ``query``, ``web_indicators``,
        ``document_indicators``, ``confidence_scores`` (``web_search`` and
        ``document_search``, each in 0.0-1.0), ``suggested_route`` (a
        QueryType) and ``reasoning`` (list of strings).
    """
    query_lower = query.lower()

    def mentions(phrase: str) -> bool:
        # Lookarounds instead of \b so the phrase must not be embedded in a
        # longer word on either side.
        pattern = r'(?<!\w)' + re.escape(phrase) + r'(?:e?s)?(?!\w)'
        return re.search(pattern, query_lower) is not None

    analysis = {
        'query': query,
        'web_indicators': [],
        'document_indicators': [],
        'confidence_scores': {
            'web_search': 0.0,
            'document_search': 0.0
        },
        'suggested_route': QueryType.DOCUMENT_ONLY,
        'reasoning': []
    }

    # Web-search indicators, weighted per category.
    web_score = 0
    for category, keywords in self.web_search_keywords.items():
        for keyword in keywords:
            if mentions(keyword):
                analysis['web_indicators'].append(f"{keyword} ({category})")
                web_score += self._get_keyword_weight(category)

    # Explicit references to the user's documents carry a high weight.
    doc_score = 0
    for keyword in self.document_keywords:
        if mentions(keyword):
            analysis['document_indicators'].append(keyword)
            doc_score += 2.0

    # General-knowledge terms nudge towards web search.
    for keyword in self.general_knowledge_keywords:
        if mentions(keyword):
            analysis['web_indicators'].append(f"{keyword} (general_knowledge)")
            web_score += 0.5

    # Each distinct question word adds a little web-search score. Tokenising
    # on word characters lets "what?" and "what's" count as "what".
    question_words = ['how', 'what', 'why', 'when', 'where', 'who', 'which']
    tokens = set(re.findall(r'\w+', query_lower))
    question_count = sum(1 for word in question_words if word in tokens)
    if question_count > 0:
        web_score += 0.3 * question_count

    # Longer queries get a small web-search bonus.
    if len(query.split()) > 10:
        web_score += 0.2

    # Normalize scores into 0.0-1.0.
    max_possible_score = 10.0
    web_confidence = min(web_score / max_possible_score, 1.0)
    # 0.3 is the baseline document confidence. It is applied as a floor so
    # that an explicit document reference can never score below a query
    # with no reference at all.
    doc_confidence = max(min(doc_score / max_possible_score, 1.0), 0.3)
    analysis['confidence_scores']['web_search'] = web_confidence
    analysis['confidence_scores']['document_search'] = doc_confidence

    # Determine routing strategy.
    if doc_confidence > 0.7:
        analysis['suggested_route'] = QueryType.DOCUMENT_ONLY
        analysis['reasoning'].append("Strong document reference indicators")
    elif web_confidence > 0.35:
        analysis['suggested_route'] = QueryType.WEB_SEARCH
        analysis['reasoning'].append("Web search indicators detected")
    elif web_confidence > 0.25 and doc_confidence > 0.3:
        analysis['suggested_route'] = QueryType.HYBRID
        analysis['reasoning'].append("Mixed indicators suggest hybrid approach")
    else:
        analysis['suggested_route'] = QueryType.DOCUMENT_ONLY
        analysis['reasoning'].append("Default to document search - prefer uploaded documents")

    return analysis
147
+
148
+ def _get_keyword_weight(self, category: str) -> float:
149
+ """Get weight for different keyword categories"""
150
+ weights = {
151
+ 'temporal': 1.5, # Strong indicator for web search
152
+ 'explanatory': 0.8, # Medium indicator
153
+ 'comparative': 1.2, # Strong indicator
154
+ 'current_data': 1.5, # Strong indicator
155
+ 'specifications': 1.0, # Medium indicator
156
+ 'superlatives': 1.8, # Very strong indicator for web search
157
+ 'factual_queries': 1.6 # Strong indicator for web search
158
+ }
159
+ return weights.get(category, 0.5)
160
+
161
def should_use_web_search(self, query: str, document_results: List = None) -> Tuple[bool, str]:
    """Decide whether web search is needed for ``query``.

    Args:
        query: User query.
        document_results: Results of a document search, if one was run;
            each result's ``score`` is read with ``.get('score', 0)``.

    Returns:
        Tuple of (should_use_web, reasoning).
    """
    route = self.analyze_query(query)['suggested_route']

    if route == QueryType.WEB_SEARCH:
        return True, "Query indicates need for web search"

    if route == QueryType.HYBRID:
        # Hybrid queries go to the web only when documents give nothing
        # or only very weak matches.
        if not document_results or len(document_results) == 0:
            return True, "Hybrid query with no document results"
        if len(document_results) > 0:
            top_score = max(hit.get('score', 0) for hit in document_results)
            if top_score < 0.05:
                return True, "Hybrid query with very low-quality document results"
    elif route == QueryType.DOCUMENT_ONLY:
        # Document-only queries fall back to the web only when a search
        # was run and returned nothing.
        searched_but_empty = document_results is not None and len(document_results) == 0
        if searched_but_empty:
            return True, "No document results found, falling back to web search"

    return False, "Document search should be sufficient"
195
+
196
def get_routing_explanation(self, query: str) -> str:
    """Return a Markdown explanation of the keyword-based routing decision.

    Args:
        query: User query.

    Returns:
        Explanation string listing up to three web indicators, all document
        indicators, the suggested strategy and the reasoning.
    """
    analysis = self.analyze_query(query)
    parts = [f"**Query Analysis for:** {query}\n\n"]

    web_hits = analysis['web_indicators']
    if web_hits:
        parts.append("**Web Search Indicators Found:**\n")
        # Only the first three indicators are listed.
        parts.extend(f"- {hit}\n" for hit in web_hits[:3])
        parts.append("\n")

    doc_hits = analysis['document_indicators']
    if doc_hits:
        parts.append("**Document Search Indicators Found:**\n")
        parts.extend(f"- {hit}\n" for hit in doc_hits)
        parts.append("\n")

    parts.append(f"**Suggested Strategy:** {analysis['suggested_route'].value}\n\n")

    reasons = analysis['reasoning']
    if reasons:
        parts.append("**Reasoning:** " + ", ".join(reasons))

    return "".join(parts)
228
+
229
def analyze_query_semantic(self, query: str, vector_store=None, similarity_threshold: float = 0.15) -> Dict:
    """Route a query by its similarity to the indexed documents.

    Args:
        query: User's input query.
        vector_store: Object exposing ``search(query, k=...)`` and a
            ``documents`` collection; results are dicts read via
            ``.get('score', 0)``.
        similarity_threshold: Minimum best score for documents to be
            preferred over web search.

    Returns:
        Dict with ``suggested_route``, ``reasoning`` and
        ``similarity_score``. When semantic routing itself fails, the
        keyword-based analysis is returned with ``similarity_score`` added,
        so the key is present on every path.
    """
    try:
        # Without a usable store or any documents, only the web can answer.
        if not vector_store or not hasattr(vector_store, 'search') or len(getattr(vector_store, 'documents', [])) == 0:
            return {
                'suggested_route': QueryType.WEB_SEARCH,
                'reasoning': ['No documents available - using web search'],
                'similarity_score': 0.0
            }

        # Strong temporal words always go to the web. They are matched as
        # whole words so that "know" or "snow" no longer count as "now".
        temporal_keywords = ['latest', 'recent', 'current', 'now', 'today', 'this year', '2024', '2025', 'breaking', 'news']
        query_lower = query.lower()

        for keyword in temporal_keywords:
            if re.search(r'(?<!\w)' + re.escape(keyword) + r'(?!\w)', query_lower):
                return {
                    'suggested_route': QueryType.WEB_SEARCH,
                    'reasoning': [f'Temporal keyword "{keyword}" detected - using web search for current information'],
                    'similarity_score': 0.0
                }

        try:
            results = vector_store.search(query, k=3)

            if not results:
                return {
                    'suggested_route': QueryType.WEB_SEARCH,
                    'reasoning': ['No document matches found - using web search'],
                    'similarity_score': 0.0
                }

            best_score = max(r.get('score', 0) for r in results)

            print(f"DEBUG: Semantic routing - Query: '{query[:50]}...', Best similarity: {best_score:.3f}, Threshold: {similarity_threshold}")

            if best_score >= similarity_threshold:
                return {
                    'suggested_route': QueryType.DOCUMENT_ONLY,
                    'reasoning': [f'High document relevance (score: {best_score:.3f}) - using document search'],
                    'similarity_score': best_score
                }
            return {
                'suggested_route': QueryType.WEB_SEARCH,
                'reasoning': [f'Low document relevance (score: {best_score:.3f}) - using web search'],
                'similarity_score': best_score
            }

        except Exception as search_error:
            print(f"DEBUG: Semantic search failed: {search_error}")
            return {
                'suggested_route': QueryType.WEB_SEARCH,
                'reasoning': ['Document search failed - using web search'],
                'similarity_score': 0.0
            }

    except Exception as e:
        print(f"DEBUG: Semantic routing error: {e}")
        # Fall back to keyword routing, keeping the result shape consistent
        # with the semantic paths above.
        fallback = self.analyze_query(query)
        fallback.setdefault('similarity_score', 0.0)
        return fallback
components/vector_store.py ADDED
@@ -0,0 +1,379 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pickle
3
+ import os
4
+ from typing import List, Dict, Tuple
5
+ import json
6
+ import re
7
+ from collections import Counter
8
+ import math
9
+ import config
10
+
11
+ # Import torch for device handling
12
+ try:
13
+ import torch
14
+ TORCH_AVAILABLE = True
15
+ except ImportError:
16
+ TORCH_AVAILABLE = False
17
+
18
+ # Import Hugging Face client
19
+ try:
20
+ from .huggingface_client import HuggingFaceEmbeddingModel
21
+ HUGGINGFACE_AVAILABLE = True
22
+ except ImportError:
23
+ HUGGINGFACE_AVAILABLE = False
24
+
25
+ # Fallback to sentence transformers
26
+ try:
27
+ import faiss
28
+ from sentence_transformers import SentenceTransformer
29
+ SENTENCE_TRANSFORMERS_AVAILABLE = True
30
+ except ImportError:
31
+ SENTENCE_TRANSFORMERS_AVAILABLE = False
32
+ print("Sentence transformers not available. Using TF-IDF fallback.")
33
+
34
+ class VectorStore:
35
+ """
36
+ Vector store using Sentence Transformers for embeddings and FAISS for similarity search
37
+ """
38
+
39
def __init__(self, model_name: str = None, index_path: str = "vector_index"):
    """Initialise the store and load the best available embedding backend.

    Args:
        model_name: Embedding model identifier; falls back to
            ``config.EMBEDDING_MODEL`` when empty or None.
        index_path: Stored on the instance as ``index_path``.
    """
    self.model_name = model_name if model_name else config.EMBEDDING_MODEL
    self.index_path = index_path
    self.embedding_model = None
    self.index = None
    self.documents = []
    self.dimension = None
    # Backend availability flags are decided at import time.
    self.use_huggingface = HUGGINGFACE_AVAILABLE
    self.use_sentence_transformers = SENTENCE_TRANSFORMERS_AVAILABLE

    # Preference order: HuggingFace wrapper, sentence-transformers, simple search.
    if self.use_huggingface:
        backend_loader = self._load_huggingface_model
    elif self.use_sentence_transformers:
        backend_loader = self._load_sentence_transformer_model
    else:
        backend_loader = self._init_simple_search
    backend_loader()
55
+
56
+ def _load_huggingface_model(self):
57
+ """Load the Hugging Face embedding model"""
58
+ try:
59
+ self.embedding_model = HuggingFaceEmbeddingModel(self.model_name)
60
+ # Get dimension
61
+ self.dimension = self.embedding_model.get_dimension()
62
+ print(f"Loaded HuggingFace embedding model: {self.model_name} (dimension: {self.dimension})")
63
+ except Exception as e:
64
+ print(f"Error loading HuggingFace model: {str(e)}")
65
+ self.use_huggingface = False
66
+ if self.use_sentence_transformers:
67
+ self._load_sentence_transformer_model()
68
+ else:
69
+ self._init_simple_search()
70
+
71
+ def _load_sentence_transformer_model(self):
72
+ """Load the sentence transformer model for embeddings"""
73
+ try:
74
+ # Load with careful device handling - let the library handle device assignment
75
+ self.embedding_model = SentenceTransformer(
76
+ self.model_name,
77
+ device=None, # Let the library choose the best device
78
+ trust_remote_code=True
79
+ )
80
+
81
+ # Get dimension from a sample embedding
82
+ sample_embedding = self.embedding_model.encode(["sample"])
83
+ self.dimension = sample_embedding.shape[1] if hasattr(sample_embedding, 'shape') else len(sample_embedding)
84
+ print(f"Loaded sentence transformer model: {self.model_name} (dimension: {self.dimension})")
85
+ except Exception as e:
86
+ print(f"Error loading sentence transformer model: {str(e)}")
87
+ self.use_sentence_transformers = False
88
+ self._init_simple_search()
89
+
90
+ def _preprocess_text(self, text: str) -> List[str]:
91
+ """Simple text preprocessing for TF-IDF"""
92
+ # Convert to lowercase and remove punctuation
93
+ text = re.sub(r'[^\w\s]', ' ', text.lower())
94
+ # Split into words and remove empty strings
95
+ words = [word for word in text.split() if len(word) > 2]
96
+ return words
97
+
98
+ def _compute_tf(self, words: List[str]) -> Dict[str, float]:
99
+ """Compute term frequency"""
100
+ word_count = len(words)
101
+ tf_dict = {}
102
+ for word in words:
103
+ tf_dict[word] = tf_dict.get(word, 0) + 1
104
+ # Normalize by total word count
105
+ for word in tf_dict:
106
+ tf_dict[word] = tf_dict[word] / word_count
107
+ return tf_dict
108
+
109
+ def _compute_idf(self):
110
+ """Compute inverse document frequency for all terms"""
111
+ N = len(self.documents)
112
+ all_words = set()
113
+ for doc in self.documents:
114
+ words = self._preprocess_text(doc['text'])
115
+ all_words.update(set(words))
116
+
117
+ for word in all_words:
118
+ containing_docs = sum(1 for doc in self.documents
119
+ if word in self._preprocess_text(doc['text']))
120
+ self.idf_scores[word] = math.log(N / containing_docs) if containing_docs > 0 else 0
121
+
122
+ def _compute_tfidf_similarity(self, query: str, doc_text: str) -> float:
123
+ """Compute TF-IDF cosine similarity between query and document"""
124
+ query_words = self._preprocess_text(query)
125
+ doc_words = self._preprocess_text(doc_text)
126
+
127
+ if not query_words or not doc_words:
128
+ return 0.0
129
+
130
+ query_tf = self._compute_tf(query_words)
131
+ doc_tf = self._compute_tf(doc_words)
132
+
133
+ # Get all unique words
134
+ all_words = set(query_words + doc_words)
135
+
136
+ # Compute TF-IDF vectors
137
+ query_vector = []
138
+ doc_vector = []
139
+
140
+ for word in all_words:
141
+ idf = self.idf_scores.get(word, 0)
142
+ query_tfidf = query_tf.get(word, 0) * idf
143
+ doc_tfidf = doc_tf.get(word, 0) * idf
144
+ query_vector.append(query_tfidf)
145
+ doc_vector.append(doc_tfidf)
146
+
147
+ # Compute cosine similarity
148
+ if not query_vector or not doc_vector:
149
+ return 0.0
150
+
151
+ dot_product = sum(a * b for a, b in zip(query_vector, doc_vector))
152
+ query_norm = math.sqrt(sum(a * a for a in query_vector))
153
+ doc_norm = math.sqrt(sum(a * a for a in doc_vector))
154
+
155
+ if query_norm == 0 or doc_norm == 0:
156
+ return 0.0
157
+
158
+ return dot_product / (query_norm * doc_norm)
159
+
160
+ def _init_simple_search(self):
161
+ """Initialize simple TF-IDF search"""
162
+ self.vocabulary = {}
163
+ self.idf_scores = {}
164
+ print("Initialized simple TF-IDF search (advanced embeddings not available)")
165
+
166
+ def create_embeddings(self, texts: List[str]) -> np.ndarray:
167
+ """Create embeddings for a list of texts"""
168
+ if self.use_huggingface or self.use_sentence_transformers:
169
+ try:
170
+ embeddings = self.embedding_model.encode(texts)
171
+ if hasattr(embeddings, 'numpy'):
172
+ embeddings = embeddings.numpy()
173
+ return embeddings.astype('float32')
174
+ except Exception as e:
175
+ print(f"Error creating embeddings, falling back to simple search: {str(e)}")
176
+ self.use_huggingface = False
177
+ self.use_sentence_transformers = False
178
+ self._init_simple_search()
179
+
180
+ # Return dummy embeddings for simple search
181
+ return np.zeros((len(texts), 100), dtype='float32')
182
+
183
+ def initialize_index(self):
184
+ """Initialize FAISS index"""
185
+ if not (self.use_huggingface or self.use_sentence_transformers):
186
+ return
187
+
188
+ if self.dimension is None:
189
+ raise Exception("Embedding model not properly loaded")
190
+
191
+ # Use IndexFlatIP for cosine similarity (Inner Product)
192
+ self.index = faiss.IndexFlatIP(self.dimension)
193
+ print(f"Initialized FAISS index with dimension {self.dimension}")
194
+
195
+ def add_documents(self, chunks: List[Dict]):
196
+ """Add document chunks to the vector store"""
197
+ if not chunks:
198
+ return
199
+
200
+ # Store documents with metadata
201
+ for i, chunk in enumerate(chunks):
202
+ self.documents.append({
203
+ 'id': len(self.documents),
204
+ 'text': chunk['text'],
205
+ 'metadata': chunk['metadata'],
206
+ 'embedding_id': len(self.documents)
207
+ })
208
+
209
+ if self.use_huggingface or self.use_sentence_transformers:
210
+ # Initialize index if not done
211
+ if self.index is None:
212
+ self.initialize_index()
213
+
214
+ # Extract texts for embedding
215
+ texts = [chunk['text'] for chunk in chunks]
216
+
217
+ # Create embeddings
218
+ embeddings = self.create_embeddings(texts)
219
+
220
+ # Normalize embeddings for cosine similarity
221
+ faiss.normalize_L2(embeddings)
222
+
223
+ # Add to FAISS index
224
+ self.index.add(embeddings)
225
+
226
+ print(f"Added {len(chunks)} document chunks to FAISS vector store")
227
+ else:
228
+ # For simple search, compute IDF scores
229
+ self._compute_idf()
230
+ print(f"Added {len(chunks)} document chunks to simple vector store")
231
+
232
+ def search(self, query: str, k: int = 5, similarity_threshold: float = 0.0) -> List[Dict]:
233
+ """Search for similar documents using semantic similarity with very low threshold"""
234
+ if len(self.documents) == 0:
235
+ return []
236
+
237
+ if (self.use_huggingface or self.use_sentence_transformers) and self.index is not None:
238
+ return self._advanced_search(query, k, similarity_threshold)
239
+ else:
240
+ return self._simple_search(query, k, similarity_threshold)
241
+
242
+ def _advanced_search(self, query: str, k: int, similarity_threshold: float) -> List[Dict]:
243
+ """Advanced search using FAISS and sentence transformers"""
244
+ # Create query embedding
245
+ query_embedding = self.create_embeddings([query])
246
+
247
+ # Normalize for cosine similarity
248
+ faiss.normalize_L2(query_embedding)
249
+
250
+ # Search in FAISS index
251
+ scores, indices = self.index.search(query_embedding, min(k, len(self.documents)))
252
+
253
+ results = []
254
+ for i, (score, idx) in enumerate(zip(scores[0], indices[0])):
255
+ # Filter by similarity threshold
256
+ if score >= similarity_threshold and idx < len(self.documents):
257
+ result = {
258
+ 'document': self.documents[idx],
259
+ 'score': float(score),
260
+ 'rank': i + 1
261
+ }
262
+ results.append(result)
263
+
264
+ return results
265
+
266
+ def _simple_search(self, query: str, k: int, similarity_threshold: float) -> List[Dict]:
267
+ """Simple search using improved TF-IDF similarity with better matching"""
268
+ if not self.documents:
269
+ return []
270
+
271
+ # Compute similarities
272
+ similarities = []
273
+ for doc in self.documents:
274
+ # Calculate multiple similarity scores for better matching
275
+ tfidf_similarity = self._compute_tfidf_similarity(query, doc['text'])
276
+ keyword_similarity = self._compute_keyword_similarity(query, doc['text'])
277
+ combined_similarity = max(tfidf_similarity, keyword_similarity * 0.7) # Boost keyword matches
278
+
279
+ similarities.append({
280
+ 'document': doc,
281
+ 'score': combined_similarity,
282
+ 'rank': 0 # Will be set after sorting
283
+ })
284
+
285
+ # Sort by similarity score
286
+ similarities.sort(key=lambda x: x['score'], reverse=True)
287
+
288
+ # Always return results, ignore similarity threshold for TF-IDF fallback
289
+ results = []
290
+ for i, result in enumerate(similarities[:k]):
291
+ result['rank'] = i + 1
292
+ results.append(result)
293
+
294
+ return results
295
+
296
+ def _compute_keyword_similarity(self, query: str, text: str) -> float:
297
+ """Compute simple keyword-based similarity"""
298
+ query_words = set(query.lower().split())
299
+ text_words = set(text.lower().split())
300
+
301
+ if not query_words:
302
+ return 0.0
303
+
304
+ # Calculate Jaccard similarity
305
+ intersection = query_words.intersection(text_words)
306
+ union = query_words.union(text_words)
307
+
308
+ if not union:
309
+ return 0.0
310
+
311
+ return len(intersection) / len(union)
312
+
313
+ def save_index(self):
314
+ """Save vector store to disk"""
315
+ try:
316
+ if (self.use_huggingface or self.use_sentence_transformers) and self.index is not None:
317
+ # Save FAISS index
318
+ faiss.write_index(self.index, f"{self.index_path}.faiss")
319
+
320
+ # Save documents and metadata
321
+ with open(f"{self.index_path}_docs.pkl", "wb") as f:
322
+ pickle.dump({
323
+ 'documents': self.documents,
324
+ 'dimension': self.dimension,
325
+ 'model_name': self.model_name,
326
+ 'use_huggingface': self.use_huggingface,
327
+ 'use_sentence_transformers': self.use_sentence_transformers,
328
+ 'vocabulary': getattr(self, 'vocabulary', {}),
329
+ 'idf_scores': getattr(self, 'idf_scores', {})
330
+ }, f)
331
+
332
+ print(f"Saved vector index to {self.index_path}")
333
+ except Exception as e:
334
+ print(f"Error saving index: {str(e)}")
335
+
336
+ def load_index(self):
337
+ """Load vector store from disk"""
338
+ try:
339
+ if os.path.exists(f"{self.index_path}_docs.pkl"):
340
+ # Load documents and metadata
341
+ with open(f"{self.index_path}_docs.pkl", "rb") as f:
342
+ data = pickle.load(f)
343
+ self.documents = data['documents']
344
+ self.dimension = data.get('dimension')
345
+ self.vocabulary = data.get('vocabulary', {})
346
+ self.idf_scores = data.get('idf_scores', {})
347
+ stored_use_hf = data.get('use_huggingface', False)
348
+ stored_use_st = data.get('use_sentence_transformers', data.get('use_advanced', True))
349
+
350
+ # Load FAISS index if available and we're using embeddings
351
+ if ((self.use_huggingface or self.use_sentence_transformers) and
352
+ (stored_use_hf or stored_use_st) and
353
+ os.path.exists(f"{self.index_path}.faiss")):
354
+ self.index = faiss.read_index(f"{self.index_path}.faiss")
355
+
356
+ print(f"Loaded vector index from {self.index_path}")
357
+ return True
358
+ except Exception as e:
359
+ print(f"Error loading index: {str(e)}")
360
+
361
+ return False
362
+
363
+ def clear_index(self):
364
+ """Clear the current index and documents"""
365
+ self.index = None
366
+ self.documents = []
367
+ self.vocabulary = {}
368
+ self.idf_scores = {}
369
+ print("Cleared vector index")
370
+
371
+ def get_stats(self) -> Dict:
372
+ """Get statistics about the vector store"""
373
+ return {
374
+ 'total_documents': len(self.documents),
375
+ 'index_size': self.index.ntotal if ((self.use_huggingface or self.use_sentence_transformers) and self.index) else len(self.documents),
376
+ 'dimension': self.dimension,
377
+ 'model_name': self.model_name,
378
+ 'search_type': 'HuggingFace Embeddings + FAISS' if self.use_huggingface else 'Sentence Transformers + FAISS' if self.use_sentence_transformers else 'Simple TF-IDF'
379
+ }
components/web_search.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import json
3
+ from typing import List, Dict, Optional
4
+ import os
5
+ from dotenv import load_dotenv
6
+
7
+ # Load environment variables
8
+ load_dotenv()
9
+
10
class WebSearcher:
    """
    Serper.dev API integration for web search functionality.

    Raw responses are normalised into dicts with the keys ``rank``, ``title``,
    ``snippet``, ``link``, ``source`` and ``type`` (``organic``, ``answer_box``
    or ``knowledge_graph``).
    """

    def __init__(self, api_key: Optional[str] = None, timeout: int = 10):
        """
        Args:
            api_key: Serper API key; falls back to the ``SERPER_API_KEY``
                environment variable.
            timeout: Request timeout in seconds passed to ``requests.post``.

        Raises:
            ValueError: if no API key is supplied or found in the environment.
        """
        self.api_key = api_key or os.getenv("SERPER_API_KEY")
        self.base_url = "https://google.serper.dev/search"
        self.timeout = timeout

        if not self.api_key:
            raise ValueError("Serper API key is required. Please set SERPER_API_KEY in your .env file")

    @staticmethod
    def _source_from_link(link: str) -> str:
        """Return the host part of ``link`` (empty string when it cannot be determined)."""
        from urllib.parse import urlparse

        if not link:
            return ''
        try:
            return urlparse(link).netloc
        except ValueError:
            return ''

    def search(self, query: str, num_results: int = 5) -> Dict:
        """
        Perform web search using Serper API

        Args:
            query: Search query
            num_results: Number of results to return

        Returns:
            Dictionary containing search results

        Raises:
            Exception: if the HTTP request fails or returns an error status;
                the underlying ``requests`` exception is attached as the cause.
        """
        headers = {
            'X-API-KEY': self.api_key,
            'Content-Type': 'application/json'
        }

        payload = {
            'q': query,
            'num': num_results,
            'page': 1
        }

        try:
            response = requests.post(
                self.base_url,
                headers=headers,
                data=json.dumps(payload),
                timeout=self.timeout
            )

            response.raise_for_status()
            return response.json()

        except requests.exceptions.RequestException as e:
            # Chain the original error so the root cause stays in the traceback.
            raise Exception(f"Web search failed: {str(e)}") from e

    def format_search_results(self, search_response: Dict) -> List[Dict]:
        """
        Format search results into a standardized structure

        The answer box (if any) is placed first, the knowledge graph (if any)
        directly after it, followed by the organic results in API order.

        Args:
            search_response: Raw response from Serper API

        Returns:
            List of formatted search results
        """
        formatted_results = []

        # Process organic results ("organic": null is treated like a missing key)
        organic_results = search_response.get('organic') or []

        for i, result in enumerate(organic_results):
            link = result.get('link', '')
            formatted_results.append({
                'rank': i + 1,
                'title': result.get('title', ''),
                'snippet': result.get('snippet', ''),
                'link': link,
                # Organic entries often carry no 'displayLink'; fall back to the
                # link's host so the result still has a usable source label.
                'source': result.get('displayLink') or self._source_from_link(link),
                'type': 'organic'
            })

        # Process answer box if available
        answer_box = search_response.get('answerBox')
        if answer_box:
            formatted_results.insert(0, {
                'rank': 0,  # Answer box gets top priority
                'title': answer_box.get('title', 'Direct Answer'),
                'snippet': answer_box.get('answer', answer_box.get('snippet', '')),
                'link': answer_box.get('link', ''),
                'source': answer_box.get('displayLink', 'Google'),
                'type': 'answer_box'
            })

        # Process knowledge graph if available
        knowledge_graph = search_response.get('knowledgeGraph')
        if knowledge_graph:
            formatted_results.insert(0 if not answer_box else 1, {
                'rank': 0,
                'title': knowledge_graph.get('title', 'Knowledge Graph'),
                'snippet': knowledge_graph.get('description', ''),
                'link': knowledge_graph.get('descriptionLink', ''),
                'source': knowledge_graph.get('source', 'Google Knowledge Graph'),
                'type': 'knowledge_graph'
            })

        return formatted_results

    def search_and_format(self, query: str, num_results: int = 5) -> List[Dict]:
        """
        Perform search and return formatted results

        This is a best-effort wrapper: any failure is printed and an empty list
        is returned instead of raising.

        Args:
            query: Search query
            num_results: Number of results to return

        Returns:
            List of formatted search results
        """
        try:
            search_response = self.search(query, num_results)
            return self.format_search_results(search_response)

        except Exception as e:
            print(f"Error in web search: {str(e)}")
            return []

    def create_search_summary(self, results: List[Dict], max_length: int = 1000) -> str:
        """
        Create a summary from search results

        Only the first three results are considered; a result is skipped when
        it has no snippet or when its snippet would exceed the length budget.

        Args:
            results: List of search results
            max_length: Maximum length of summary

        Returns:
            Summary text with sources
        """
        if not results:
            return "No web search results found."

        summary_parts = []
        sources = []
        current_length = 0

        for result in results[:3]:  # Use top 3 results for summary
            snippet = result.get('snippet', '')
            title = result.get('title', '')
            link = result.get('link', '')
            # Results built elsewhere may lack 'source'; derive it from the link.
            source = result.get('source', '') or self._source_from_link(link)

            if snippet and current_length + len(snippet) < max_length:
                summary_parts.append(f"**{title}**: {snippet}")
                if source and link:
                    sources.append(f"- [{source}]({link})")
                current_length += len(snippet) + len(title) + 4

        # Combine summary parts
        summary = "\n\n".join(summary_parts)

        if sources:
            summary += "\n\n**Sources:**\n" + "\n".join(sources)

        return summary
config.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Local Hugging Face Model Settings
2
+ EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2" # Fast embedding model
3
+ CHAT_MODEL = "google/flan-t5-base" # Better for summarization and QA tasks
4
+
5
+ # Alternative chat models you can use (just change CHAT_MODEL):
6
+ # "google/flan-t5-small" (faster, smaller - 250MB)
7
+ # "google/flan-t5-base" (good balance - 990MB) - RECOMMENDED
8
+ # "google/flan-t5-large" (better quality, slower - 3GB)
9
+ # "facebook/bart-large-cnn" (excellent for summarization but larger)
10
+ # "t5-small" (good for summarization, 240MB)
11
+
12
+ # Model Settings
13
+ MODEL_MAX_LENGTH = 1000 # Maximum tokens for generation
14
+ TEMPERATURE = 0.7 # Creativity (0.0 = deterministic, 1.0 = very creative)
15
+ USE_CUDA = True # Set to False if you don't have GPU
16
+ DEVICE = "cpu" # "auto", "cuda", "cpu"
17
+ MODEL_CACHE_DIR = "./models" # Local directory to cache downloaded models
18
+
19
+ # Document Processing Settings
20
+ CHUNK_SIZE = 1000
21
+ CHUNK_OVERLAP = 200
22
+
23
+ # Vector Store Settings
24
+ SIMILARITY_THRESHOLD = 0.1
25
+ MAX_SEARCH_RESULTS = 5
26
+
27
+ # Web Search Settings
28
+ WEB_SEARCH_RESULTS = 5
29
+ WEB_SEARCH_TIMEOUT = 10
30
+
31
+ # Query Routing Settings
32
+ WEB_SEARCH_CONFIDENCE_THRESHOLD = 0.6
33
+ DOCUMENT_SEARCH_CONFIDENCE_THRESHOLD = 0.7
34
+ HYBRID_THRESHOLD = 0.3
35
+
36
+ # Fallback Settings (used if the local Hugging Face embedding model is not available)
37
+ USE_SENTENCE_TRANSFORMERS_FALLBACK = True
38
+ FALLBACK_EMBEDDING_MODEL = "all-MiniLM-L6-v2" # Sentence Transformers model
39
+
40
+ # UI Settings
41
+ PAGE_TITLE = "Universal Document Intelligence Chatbot"
42
+ LAYOUT = "wide"
43
+
44
+ # File Settings
45
+ SUPPORTED_FILE_TYPES = ['pdf']
46
+ MAX_FILE_SIZE_MB = 50
47
+
48
+ # Response Settings
49
+ MAX_RESPONSE_LENGTH = 2000
50
+ MAX_SOURCES_DISPLAYED = 3
requirements.txt CHANGED
@@ -1,3 +1,45 @@
1
- altair
2
- pandas
3
- streamlit
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core web framework
2
+ streamlit==1.28.0
3
+
4
+ # Machine Learning & AI
5
+ torch==2.1.0
6
+ transformers==4.35.2
7
+ sentence-transformers==2.2.2
8
+ accelerate==0.24.1
9
+
10
+ # Vector Search & Embeddings
11
+ faiss-cpu==1.7.4
12
+ numpy==1.24.3
13
+
14
+ # Document Processing
15
+ PyPDF2==3.0.1
16
+
17
+ # Data Processing
18
+ pandas==2.0.3
19
+
20
+ # Environment & Configuration
21
+ python-dotenv==1.0.0
22
+
23
+ # HTTP Requests
24
+ requests==2.31.0
25
+
26
+ # Image Processing (compatible with Streamlit)
27
+ pillow>=7.1.0,<11.0.0
28
+
29
+ # Package compatibility (compatible with Streamlit)
30
+ packaging>=16.8,<24.0
31
+
32
+ # Fast model downloads
33
+ hf_xet>=1.1.0
34
+
35
+ # Additional dependencies for sentence-transformers
36
+ scikit-learn>=1.2.0
37
+ scipy>=1.10.0
38
+ nltk>=3.8
39
+ sentencepiece>=0.1.96
40
+
41
+ # For tokenization
42
+ tokenizers>=0.13.0,<0.15.0
43
+
44
+ # For HuggingFace Hub
45
+ huggingface-hub>=0.16.0,<1.0.0