ashish-ninehertz commited on
Commit
944bdbc
·
1 Parent(s): e379072
.gitignore ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # Virtual environment
7
+ venv/
8
+ .env
9
+ .venv/
10
+
11
+ .history/
12
+
13
+
14
+ # Jupyter Notebook checkpoints
15
+ .ipynb_checkpoints
16
+
17
+ # VS Code settings
18
+ .vscode/
19
+
20
+ # OS files
21
+ .DS_Store
22
+ Thumbs.db
23
+
24
+ # Logs
25
+ *.log
26
+
27
+ # Environment variable files
28
+ .env
29
+ .env.*
30
+
31
+ # Data and cache
32
+ data/
33
+ *.sqlite3
34
+ *.db
35
+
36
+ # Python egg files
37
+ *.egg
38
+ *.egg-info/
39
+ dist/
40
+ build/
41
+ .eggs/
42
+
43
+ # Qdrant local storage (if running locally)
.gitattributes → CrawlyBot/.gitattributes RENAMED
File without changes
CrawlyBot/README.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: CrawlyBot
3
+ emoji: 🏢
4
+ colorFrom: blue
5
+ colorTo: red
6
+ sdk: gradio
7
+ sdk_version: 5.34.2
8
+ app_file: app.py
9
+ pinned: false
10
+ license: other
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
README.md CHANGED
@@ -1,13 +1,10 @@
1
  ---
2
- title: CrawlyBot
3
- emoji: 🏢
4
- colorFrom: blue
5
- colorTo: red
6
  sdk: gradio
7
- sdk_version: 5.34.2
8
- app_file: app.py
9
  pinned: false
10
- license: other
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Buddy Your Bot – RAG Chatbot
3
+ emoji: 🤖
4
+ colorFrom: indigo
5
+ colorTo: blue
6
  sdk: gradio
7
+ sdk_version: "4.25.0"
8
+ app_file: app/web/gradio_app.py
9
  pinned: false
10
+ ---
 
 
 
app ADDED
@@ -0,0 +1 @@
 
 
1
+ Subproject commit 8b9665bbfca64069c80341de7a68c92b6d066bd1
app.py ADDED
@@ -0,0 +1,286 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import uuid
3
+ import logging
4
+ from typing import List, Tuple
5
+ from app.main import RAGSystem
6
+ import asyncio
7
+
8
# Configure logging for the whole app; module-level logger per stdlib convention.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize the RAG system
# NOTE(review): constructed at import time — any heavy setup inside
# RAGSystem() runs as soon as this module is imported; confirm that is
# acceptable for the deployment runtime.
rag = RAGSystem()
14
+
15
def create_session() -> str:
    """Mint a fresh, globally unique session identifier (UUID4 as text)."""
    new_id = uuid.uuid4()
    return str(new_id)
18
+
19
def index_website(url: str, session_id: str) -> Tuple[bool, str]:
    """Crawl and index *url* under *session_id*.

    Returns a pair ``(ok, message)`` where *message* is a human-readable
    status suitable for direct display in the UI.
    """
    try:
        outcome = rag.crawl_and_index(session_id, url)
        if outcome["status"] != "success":
            return False, outcome.get("message", "Unknown error during indexing")
        page_count = len(outcome.get('urls_processed', []))
        return True, f"Successfully indexed {page_count} pages"
    except Exception as e:
        # Surface the failure to the UI rather than crashing the handler.
        logger.error(f"Indexing error: {str(e)}")
        return False, f"Error during indexing: {str(e)}"
29
+
30
def chat_response(
    session_id: str,
    message: str,
    model_choice: str,
    ollama_url: str,
    gemini_api_key: str,
    chat_history: List[dict]
) -> Tuple[List[dict], str]:
    """Generate a chat response with proper error handling.

    Parameters mirror the Gradio inputs: the active session id, the typed
    message, the selected model name, per-model connection settings, and
    the running messages-format history. Returns the updated history plus
    an empty string so the message textbox is cleared.
    """
    # Record the user's turn exactly once, whatever happens next
    # (the original duplicated this append in three branches).
    chat_history.append({"role": "user", "content": f"🧑‍💻 {message}"})

    if not session_id:
        chat_history.append({"role": "assistant", "content": "🤖 Please index a website first or enter a valid session ID"})
        return chat_history, ""

    # Normalize once so the settings comparisons agree with the value
    # actually sent to rag.chat (the original compared the raw radio value
    # while sending model_choice.lower()).
    model = model_choice.lower()
    try:
        response = asyncio.run(rag.chat(
            session_id=session_id,
            question=message,
            model=model,
            ollama_url=ollama_url if model == "mistral" else None,
            gemini_api_key=gemini_api_key if model == "gemini" else None
        ))

        if response["status"] == "success":
            answer = response["response"]
            # Append a bulleted source list only when sources were returned.
            sources = "\n\nSources:\n" + "\n".join(
                f"- {src['source_url']}" for src in response.get("sources", [])
            ) if response.get("sources") else ""
            full_response = f"🤖 {answer}{sources}"
        else:
            full_response = f"🤖 Error: {response.get('message', 'Unknown error')}"
    except Exception as e:
        logger.error(f"Chat error: {str(e)}")
        full_response = f"🤖 System error: {str(e)}"

    chat_history.append({"role": "assistant", "content": full_response})
    return chat_history, ""
70
+
71
def toggle_model_inputs(model_choice: str) -> List[gr.update]:
    """Return visibility updates for (ollama_url, gemini_api_key).

    Exactly one of the two settings boxes is shown: the Ollama URL for
    "mistral", the API key box otherwise.
    """
    show_ollama = model_choice == "mistral"
    return [gr.update(visible=show_ollama), gr.update(visible=not show_ollama)]
76
+
77
def load_session(existing_session_id: str) -> Tuple[str, str]:
    """Resume a previously created session.

    Returns ``(session_id, status_message)``. An empty input yields an
    empty id plus an error hint. No existence check is performed here.
    """
    if not existing_session_id:
        return "", "Please enter a valid session ID"
    return existing_session_id, f"Loaded existing session: {existing_session_id}"
83
+
84
def get_session(self, session_id: str):
    """Return the in-memory session dict, rehydrating it from Qdrant if needed.

    NOTE(review): this function takes ``self`` but is defined at module
    level and is never called anywhere in this file — it appears to have
    been pasted from a class (presumably RAGSystem). It should live on
    that class; confirm and relocate.

    Raises:
        ValueError: when no documents are indexed for this session.
    """
    # If session exists in memory, return it
    if session_id in self.sessions:
        return self.sessions[session_id]
    # If not, check if Qdrant collection exists and has documents
    collection_name = self.get_collection_name(session_id)
    try:
        # scroll(limit=1) is the cheapest way to test for any stored point.
        results = self.qdrant_client.scroll(collection_name=collection_name, limit=1)
        if results and results[0]:
            # Rehydrate session in memory
            self.sessions[session_id] = {
                "documents": [],  # Optionally, you can fetch all docs if needed
                "history": []
            }
            return self.sessions[session_id]
    except Exception as e:
        # Missing collection (or any client error) is treated as "no session".
        logger.warning(f"Session {session_id} not found in Qdrant: {e}")
    # If not found, return None or raise
    raise ValueError("No documents indexed for this session")
103
+
104
# Custom CSS injected into the Gradio page: centered max-width layout,
# dark-mode background, colored left borders on chat bubbles, and gradient
# primary buttons. Selectors target Gradio's built-in class names.
custom_css = """
.gradio-container {
    max-width: 1200px !important;
    margin: 0 auto !important;
}
.dark .gradio-container {
    background: #1e1e2e !important;
}
#chatbot {
    min-height: 500px;
    border-radius: 12px !important;
}
.message.user {
    border-left: 4px solid #4f46e5 !important;
}
.message.assistant {
    border-left: 4px solid #10b981 !important;
}
.btn-primary {
    background: linear-gradient(to right, #4f46e5, #7c3aed) !important;
    border: none !important;
}
.btn-primary:hover {
    background: linear-gradient(to right, #4338ca, #6d28d9) !important;
}
.prose {
    max-width: 100% !important;
}
"""
134
+
135
# Top-level Gradio UI: two tabs (index a website, chat with it) sharing a
# session id held in gr.State.
with gr.Blocks(title="RAG Chat with Mistral/Gemini", css=custom_css, theme="soft") as demo:
    # Header section
    with gr.Row():
        gr.Markdown("""
        # 🌐 RAG Chat Assistant
        ### Chat with any website using Mistral or Gemini
        """)

    # Session state — empty string until a site is indexed or a session loaded.
    session_id = gr.State("")

    with gr.Tabs():
        with gr.TabItem("📚 Index Website"):
            with gr.Row():
                with gr.Column():
                    gr.Markdown("### Step 1: Configure and Index")
                    with gr.Group():
                        url_input = gr.Textbox(
                            label="Website URL to index",
                            placeholder="https://example.com",
                            interactive=True,
                            lines=1
                        )

                        with gr.Row():
                            model_choice = gr.Radio(
                                choices=["mistral", "gemini"],
                                label="Select Model",
                                value="mistral",
                                interactive=True
                            )

                            index_btn = gr.Button(
                                "🚀 Index Website",
                                variant="primary",
                                scale=0
                            )

                        with gr.Accordion("🔐 Model Settings", open=False):
                            # Only one of these two boxes is visible at a time;
                            # toggle_model_inputs swaps them on radio change.
                            ollama_url = gr.Textbox(
                                label="Ollama URL (required for Mistral)",
                                placeholder="http://localhost:11434",
                                visible=True
                            )

                            gemini_api_key = gr.Textbox(
                                label="Gemini API Key (required for Gemini)",
                                placeholder="your-api-key-here",
                                visible=False,
                                type="password"
                            )

                    status_output = gr.Textbox(
                        label="Status",
                        interactive=False,
                        elem_classes="prose"
                    )

                    gr.Markdown("""
                    **Instructions:**
                    1. Enter a website URL
                    2. Select your preferred model
                    3. Configure model settings if needed
                    4. Click 'Index Website'
                    """)

        with gr.TabItem("💬 Chat"):
            with gr.Row():
                with gr.Column(scale=2):
                    # New session ID input for resuming sessions
                    with gr.Accordion("🔍 Resume Previous Session", open=False):
                        existing_session_input = gr.Textbox(
                            label="Enter existing Session ID",
                            placeholder="Paste your session ID here...",
                            interactive=True
                        )
                        load_session_btn = gr.Button(
                            "🔁 Load Session",
                            variant="secondary"
                        )
                        session_status = gr.Textbox(
                            label="Session Status",
                            interactive=False
                        )

                    chatbot = gr.Chatbot(
                        label="Chat History",
                        height=500,
                        avatar_images=(None, None),
                        show_copy_button=True,
                        type="messages"  # Use the new format
                    )

                    with gr.Row():
                        message_input = gr.Textbox(
                            label="Type your message",
                            placeholder="Ask about the website content...",
                            interactive=True,
                            container=False,
                            scale=7,
                            autofocus=True
                        )

                        send_btn = gr.Button(
                            "Send",
                            variant="primary",
                            scale=1,
                            min_width=100
                        )

    # Event handlers
    # Swap the visibility of the two credential boxes when the model changes.
    model_choice.change(
        fn=toggle_model_inputs,
        inputs=model_choice,
        outputs=[ollama_url, gemini_api_key]
    )

    # Indexing chain: first mint a new session id, then crawl+index on success.
    # NOTE(review): index_website returns a (bool, str) tuple, but only one
    # output component (status_output) is wired here — confirm how Gradio
    # maps the tuple, or drop the boolean from the return value.
    index_btn.click(
        fn=create_session,
        outputs=session_id
    ).success(
        fn=index_website,
        inputs=[url_input, session_id],
        outputs=[status_output]
    )

    # New handler for loading existing sessions
    load_session_btn.click(
        fn=load_session,
        inputs=[existing_session_input],
        outputs=[session_id, session_status]
    )

    send_btn.click(
        fn=chat_response,
        inputs=[session_id, message_input, model_choice, ollama_url, gemini_api_key, chatbot],
        outputs=[chatbot, message_input]
    )

    # Allow submitting with Enter key (same handler as the Send button).
    message_input.submit(
        fn=chat_response,
        inputs=[session_id, message_input, model_choice, ollama_url, gemini_api_key, chatbot],
        outputs=[chatbot, message_input]
    )

if __name__ == "__main__":
    # NOTE(review): Gradio's favicon_path expects a local file path; a remote
    # URL here is likely ignored or rejected — confirm against the pinned
    # gradio version.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        favicon_path="https://www.gradio.app/assets/favicon.ico"
    )
requirements.txt ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi>=0.68.0
2
+ uvicorn[standard]>=0.15.0
3
+ streamlit==1.32.0
4
+ requests==2.31.0
5
+ beautifulsoup4==4.12.3
6
+ python-dotenv==1.0.0
7
+ langchain>=0.1.0
8
+ langchain-community>=0.0.28
9
+ sentence-transformers>=2.2.0
10
+ ollama>=0.1.0
11
+ httpx==0.27.0
12
+ aiohttp==3.9.3
13
+ pydantic>=2.0.0
14
+ numpy>=1.21.0
15
+ websockets
16
+ qdrant-client>=1.1.0
17
+ python-multipart>=0.0.5
18
+ python-jose[cryptography]>=3.3.0
19
+ python-dateutil>=2.8.2
20
+ gradio>=4.0
setup.py ADDED
File without changes
tests/test_connection.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import socket
2
+ from qdrant_client import QdrantClient
3
+
4
def check_port(host, port):
    """Return True when a TCP connection to (host, port) succeeds."""
    probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        # connect_ex returns 0 on success instead of raising.
        return probe.connect_ex((host, port)) == 0
    finally:
        probe.close()
7
+
8
# --- ad-hoc connectivity check ---
# NOTE(review): this runs at import time, so pytest collection of this file
# will open sockets; consider guarding with `if __name__ == "__main__":`.
host = "localhost"
port = 6333  # Qdrant's default REST port

if check_port(host, port):
    print(f"Port {port} is open. Testing Qdrant API...")
    try:
        client = QdrantClient(host=host, port=port)
        print("Success! Collections:", client.get_collections())
    except Exception as e:
        # Port open but API unreachable (e.g. another service on 6333).
        print(f"API Error: {e}")
else:
    print(f"ERROR: Port {port} is closed. Check if Qdrant is running.")
tests/test_qdrant_integration.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+ from qdrant_client import QdrantClient
3
+ from qdrant_client.models import VectorParams, Distance
4
+
5
@pytest.fixture
def qdrant_client():
    """Client bound to a locally running Qdrant instance on the default port."""
    return QdrantClient(host="localhost", port=6333)

def test_collection_creation(qdrant_client):
    """Round-trip: (re)create a 384-dim cosine collection and confirm it exists."""
    test_collection = "test_collection"
    # NOTE(review): recreate_collection is deprecated in recent qdrant-client
    # releases in favor of delete_collection + create_collection — confirm
    # against the version actually pinned in requirements.txt (>=1.1.0).
    qdrant_client.recreate_collection(test_collection, vectors_config=VectorParams(size=384, distance=Distance.COSINE))
    assert qdrant_client.collection_exists(test_collection)
tests/test_storage.py ADDED
File without changes
tests/test_ws.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import websockets
3
+ import json
4
+
5
async def test_ws():
    """Smoke-test the local websocket endpoint with a single query payload."""
    uri = "ws://localhost:8000/ws/test-session"
    async with websockets.connect(uri) as ws:
        await ws.send(json.dumps({"query": "What is AI?"}))
        response = await ws.recv()
        print("Response:", response)

# NOTE(review): this executes at import time, so importing (or pytest
# collecting) this file performs network I/O; pytest will additionally try
# to collect `test_ws` as a coroutine test. Consider renaming the function
# or adopting pytest-asyncio and dropping this direct call.
asyncio.run(test_ws())