mmcc007 committed
Commit 748113b · verified · 1 parent: c62c5aa

Upload folder using huggingface_hub

Files changed (12)
  1. .env +1 -0
  2. .flaskenv +3 -0
  3. .gitignore +1 -0
  4. README.md +2 -8
  5. chat_demo.py +56 -0
  6. chatbot_ui.py +184 -0
  7. project_snapshot_no_images.json +10 -0
  8. proxy_server.py +146 -0
  9. proxy_thread.py +43 -0
  10. rag_system.py +72 -0
  11. requirements.txt +11 -0
  12. web_crawler.py +92 -0
.env ADDED
@@ -0,0 +1 @@
+ OPENAI_API_KEY=sk-proj-8BdwEtUg42Q651Jy0GPFT3BlbkFJLmIfOiZ3cXLaD5NyHqAF
.flaskenv ADDED
@@ -0,0 +1,3 @@
+ FLASK_APP=proxy_server.py
+ FLASK_ENV=development
+ FLASK_DEBUG=1
.gitignore ADDED
@@ -0,0 +1 @@
+ __pycache__
README.md CHANGED
@@ -1,12 +1,6 @@
  ---
- title: Lazzloe.com
- emoji:
- colorFrom: gray
- colorTo: pink
+ title: lazzloe.com
+ app_file: chatbot_ui.py
  sdk: gradio
  sdk_version: 4.44.1
- app_file: app.py
- pinned: false
  ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
chat_demo.py ADDED
@@ -0,0 +1,56 @@
+ import openai
+ import logging
+
+ # Configure logging
+ logger = logging.getLogger(__name__)
+
+ class ChatDemo:
+     def __init__(self, rag_system):
+         self.client = openai.OpenAI()
+         self.rag_system = rag_system
+         self.last_context_strings = []  # Attribute to store content strings
+         logger.info("ChatDemo initialized with RAG system")
+
+     def chatbot(self, user_input):
+         logger.info(f"Received user input: {user_input}")
+         try:
+             similar_chunks = self.rag_system.process_user_query(user_input)
+             logger.debug(f"Retrieved {len(similar_chunks)} similar chunks from RAG system")
+
+             # Extract and store content strings
+             self.last_context_strings = [chunk['content'] for chunk in similar_chunks]
+
+             # Log each content string
+             logger.debug(f"Logging {len(self.last_context_strings)} context strings:")
+             for i, content in enumerate(self.last_context_strings):
+                 logger.debug(f"Context string {i + 1}: {content[:100]}...")  # Log first 100 chars to avoid extremely long log lines
+
+             context = "\n".join(self.last_context_strings)
+
+             messages = [
+                 {"role": "system", "content": "You are a helpful assistant that answers questions based on the provided context."},
+                 {"role": "user", "content": f"Context: {context}\n\nQuestion: {user_input}"}
+             ]
+
+             logger.debug("Sending request to OpenAI API")
+             response = self.client.chat.completions.create(
+                 model="gpt-3.5-turbo",
+                 messages=messages
+             )
+
+             answer = response.choices[0].message.content.strip()
+             logger.info("Generated response from OpenAI API")
+             return answer
+         except Exception as e:
+             logger.error(f"Error in chatbot method: {str(e)}", exc_info=True)
+             return f"An error occurred: {str(e)}"
+
+     def get_last_context_strings(self):
+         """
+         Retrieve the list of content strings from the last chatbot query.
+
+         Returns:
+             list: A list of strings, each representing the content of a chunk used in the last context.
+         """
+         logger.info(f"Retrieving last context strings (count: {len(self.last_context_strings)})")
+         return self.last_context_strings
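Together with rag_system.py below, ChatDemo is the piece that turns retrieved chunks into a gpt-3.5-turbo answer. A minimal sketch (not part of the commit) of wiring the two classes directly, mirroring the command-line loop kept in project_snapshot_no_images.json; the sample page data and query are hypothetical, and OPENAI_API_KEY is assumed to be set:

# Sketch only: RAGSystem + ChatDemo outside Gradio.
from rag_system import RAGSystem
from chat_demo import ChatDemo

website_data = [
    {"url": "https://example.com", "paragraphs": ["Example paragraph about the site."]}
]

rag_system = RAGSystem()
rag_system.process_content(website_data)      # embed and index the crawled text

chat_demo = ChatDemo(rag_system)
print(chat_demo.chatbot("What is this site about?"))
print(chat_demo.get_last_context_strings())   # chunks used as context for the answer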
chatbot_ui.py ADDED
@@ -0,0 +1,184 @@
+ import gradio as gr
+ from web_crawler import WebCrawler
+ from rag_system import RAGSystem
+ from chat_demo import ChatDemo
+ import validators
+ import logging
+ from proxy_thread import ProxyThread
+ import requests
+
+ # Configure logging
+ logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+ logger = logging.getLogger(__name__)
+
+ # Instantiate your existing classes
+ rag_system = RAGSystem()
+ chat_demo = ChatDemo(rag_system)
+
+ # Global variables to track the URL and server state
+ url = ""
+ proxy_thread = None
+
+ def start_proxy_server():
+     global proxy_thread
+     # If the proxy server is already running, stop it first
+     if proxy_thread and proxy_thread.is_alive():
+         logger.info("Stopping the existing ProxyThread...")
+         proxy_thread.stop()
+
+     # Start a new proxy server
+     proxy_thread = ProxyThread(host='localhost', port=5000)
+     proxy_thread.start()
+     logger.info("New ProxyThread started.")
+
+ def load_website(input_url):
+     global url
+     url = input_url  # Update the global url variable
+     logger.info(f"Loading website: {url}")
+
+     # Restart the ProxyServer with the new URL
+     start_proxy_server()
+
+     # Proxy server expects a specific URL format with target_url
+     proxied_url = f"http://127.0.0.1:5000/?target_url={url}"
+
+     iframe_html = f"""
+     <iframe src="{proxied_url}" width="100%" height="600px"></iframe>
+     """
+     return iframe_html
+
+ def highlight_text(text):
+     if not url:
+         return "No website loaded."
+
+     logger.info(f"Highlighting text: {text}")
+     try:
+         response = requests.post(f'http://127.0.0.1:5000/set_highlight', json={"highlight": text})
+         if response.status_code == 200:
+             return "Highlight applied."
+         else:
+             return "Failed to apply highlight."
+     except Exception as e:
+         logger.error(f"Error highlighting text: {e}")
+         return f"Error: {e}"
+
+ def clear_highlights():
+     if not url:
+         return "No website loaded."
+
+     logger.info("Clearing highlights and reloading the website.")
+     start_proxy_server()  # Restart the proxy server to clear highlights
+     return load_website(url)  # Reload the current website without highlights
+
+ # Function to handle the initial URL submission
+ def process_url(input_url):
+     logger.info(f"Processing URL: {input_url}")
+     if not validators.url(input_url):
+         logger.error(f"Invalid URL submitted: {input_url}")
+         return "Invalid URL. Please enter a valid URL.", None
+
+     try:
+         # Display loading message
+         status_message = "Crawling website and processing data..."
+         logger.info(status_message)
+
+         # Instantiate WebCrawler with the provided URL
+         web_crawler = WebCrawler()
+
+         # Crawl the website
+         logger.info(f"Starting web crawl for {input_url}")
+         extracted_content = web_crawler.crawl(input_url, 0)
+         logger.info(f"Web crawl completed for {input_url}")
+
+         # Process the data with the RAG system
+         logger.info("Processing extracted content with RAG system")
+         rag_system.process_content(extracted_content)
+
+         # Load the website through the proxy
+         iframe_html = load_website(input_url)
+
+         logger.info("URL processing completed successfully")
+         return "Website content successfully crawled and processed!", [], iframe_html
+     except Exception as e:
+         logger.error(f"Error processing URL {input_url}: {str(e)}", exc_info=True)
+         return f"Error: {str(e)}", []
+
+ # Function to handle chatbot interactions
+ def chatbot_response(user_input, chat_history):
+     logger.info(f"Received user input: {user_input}")
+     try:
+         # Use the ChatDemo class to generate a response
+         logger.info("Generating chatbot response")
+         response = chat_demo.chatbot(user_input)
+         chat_history.append(["User", user_input])
+         chat_history.append(["Chatbot", response])
+
+         logger.info("Chatbot response generated successfully")
+
+         # Get the context strings used for the response
+         context_strings = chat_demo.get_last_context_strings()
+         logger.info(f"Retrieved {len(context_strings)} context strings")
+
+         # Highlight each context string individually
+         for i, context in enumerate(context_strings, 1):
+             highlight_result = highlight_text(context)
+             logger.info(f"Highlight result for context {i}: {highlight_result}")
+
+         # Prepare status message
+         highlight_status = f"Highlighted {len(context_strings)} context passages"
+         logger.info(highlight_status)
+
+         # Update the chat history and return
+         return chat_history, chat_history, highlight_status
+     except Exception as e:
+         logger.error(f"Error in chatbot_response: {str(e)}", exc_info=True)
+         return [[f"Error: {str(e)}"], chat_history], chat_history, f"Error: {str(e)}"
+
+ # Function to reset the application
+ def reset_app():
+     global url
+     url = ""  # Clear the global URL
+     logger.info("Resetting application and proxy server")
+     start_proxy_server()  # Restart the proxy server for a fresh session
+     return "", [], "", ""
+
+ # Build the Gradio interface
+ with gr.Blocks() as demo:
+     gr.Markdown("# Website Concierge")
+
+     with gr.Row():
+         with gr.Column(scale=1):
+             url_input = gr.Textbox(placeholder="Enter a website URL", label="Website URL", interactive=True)
+             submit_button = gr.Button("Submit URL")
+             status_message = gr.Textbox(label="Status", interactive=False)
+
+             chat_history = gr.State(value=[])
+             chatbot_output = gr.Chatbot(label="Chat History")
+             user_input = gr.Textbox(placeholder="Ask the chatbot...", label="User Input", interactive=True)
+
+             highlight_status = gr.Textbox(label="Highlight Status", interactive=False)
+
+             clear_button = gr.Button("Clear Highlights")
+             reset_button = gr.Button("Change Website")
+
+         with gr.Column(scale=1):
+             proxied_view = gr.HTML(label="Website View")
+
+     # Initial URL submission
+     submit_button.click(process_url, inputs=url_input, outputs=[status_message, chat_history, proxied_view], queue=True)
+
+     # Handle chatbot responses
+     user_input.submit(chatbot_response, inputs=[user_input, chat_history], outputs=[chatbot_output, chat_history, highlight_status])
+
+     # Handle clearing highlights
+     clear_button.click(clear_highlights, outputs=[proxied_view])
+
+     # Handle reset button click
+     reset_button.click(reset_app, outputs=[url_input, chat_history, status_message, proxied_view])
+
+ # Launch the app
+ if __name__ == "__main__":
+     logger.info("Starting Gradio application")
+     start_proxy_server()  # Start with an initial ProxyServer
+     demo.launch()
+     logger.info("Gradio application stopped")
project_snapshot_no_images.json ADDED
@@ -0,0 +1,10 @@
+ {
+ ".env": "OPENAI_API_KEY=sk-proj-8BdwEtUg42Q651Jy0GPFT3BlbkFJLmIfOiZ3cXLaD5NyHqAF\n",
+ "proxy_server.py": "import time\nfrom flask import Flask, request, Response, session, jsonify\nfrom flask_socketio import SocketIO, emit\nfrom werkzeug.serving import make_server\nimport requests\nimport re\nimport logging\nimport threading\n\nclass ProxyServer:\n def __init__(self, secret_key, host='localhost', port=5000):\n self.host = host\n self.port = port\n self.app = Flask(__name__)\n self.app.secret_key = secret_key\n self.server = None\n self.is_running = False\n self.socketio = SocketIO(self.app) # Initialize SocketIO with the Flask app\n # self.server_thread = None # Thread for running the server\n # self.server_running = False # Flag to track server state\n self.setup_routes()\n self.highlight_word = None # Initialize the highlight word\n \n @self.app.route('/shutdown', methods=['POST'])\n def shutdown():\n self.logger.info(\"Shutdown request received\")\n self.shutdown_server()\n return 'Server shutting down...'\n \n # Setup logging\n self.logger = logging.getLogger('ProxyServer')\n self.logger.setLevel(logging.DEBUG) # Set the logging level (DEBUG, INFO, WARNING, ERROR)\n handler = logging.StreamHandler() # Log to standard output (can be customized to log to a file)\n handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))\n self.logger.addHandler(handler)\n\n self.logger.info(\"Proxy server initialized\")\n\n # Inject JavaScript into HTML content to highlight words and listen for WebSocket updates\n def inject_script(self, content):\n # Inject the WebSocket listening script\n script = f\"\"\"\n <script src=\"https://cdnjs.cloudflare.com/ajax/libs/socket.io/4.0.1/socket.io.js\"></script>\n <script>\n let currentHighlight = \"{self.highlight_word}\";\n function highlightWord(word) {{\n if (word) {{\n document.body.innerHTML = document.body.innerHTML.replace(\n new RegExp(word, 'g'),\n '<span style=\"background-color: yellow;\">' + word + '</span>'\n );\n }}\n }}\n highlightWord(currentHighlight);\n\n // Connect to WebSocket\n const socket = io();\n socket.on('new_highlight', function(data) {{\n currentHighlight = data.highlight;\n highlightWord(currentHighlight);\n }});\n </script>\n \"\"\"\n return re.sub(r'</body>', script + '</body>', content)\n\n # Ensure the target_url and path are handled correctly\n def build_full_url(self, target_url, path):\n if not target_url.endswith('/') and not path.startswith('/'):\n return f\"{target_url}/{path}\"\n return f\"{target_url}{path}\"\n\n # Route handler for proxying requests\n def proxy(self, path=''):\n target_url = request.args.get('target_url')\n if not target_url and 'target_url' in session:\n target_url = session['target_url']\n elif target_url:\n session['target_url'] = target_url\n\n if not target_url:\n self.logger.error(\"No target_url provided\")\n return \"Error: target_url query parameter is required\", 400\n\n full_target_url = self.build_full_url(target_url, path)\n self.logger.info(f\"Proxying request to {full_target_url}\")\n\n headers = {key: value for key, value in request.headers if key != 'Host'}\n\n # Handle POST or GET requests\n if request.method == 'POST':\n response = requests.post(full_target_url, headers=headers, data=request.get_data(), stream=True)\n else:\n response = requests.get(full_target_url, headers=headers, stream=True)\n\n # If it's HTML content, inject the script\n if 'text/html' in response.headers.get('Content-Type', ''):\n def generate():\n for chunk in response.iter_content(chunk_size=1024):\n if chunk:\n rewritten_chunk = 
self.inject_script(chunk.decode('utf-8'))\n yield rewritten_chunk.encode('utf-8')\n self.logger.info(f\"Injecting script into HTML response from {full_target_url}\")\n return Response(generate(), content_type=response.headers['Content-Type'])\n\n # Stream non-HTML content (images, scripts, etc.)\n else:\n def generate():\n for chunk in response.iter_content(chunk_size=1024):\n if chunk:\n yield chunk\n return Response(generate(), content_type=response.headers['Content-Type'])\n\n # API endpoint to set a new highlight word\n def set_highlight(self):\n new_highlight = request.json.get('highlight')\n if new_highlight:\n self.highlight_word = new_highlight\n # Emit the new highlight word to all connected clients\n self.socketio.emit('new_highlight', {'highlight': new_highlight})\n self.logger.info(f\"Highlight word updated to '{new_highlight}' and broadcasted to clients\")\n return jsonify({\"message\": \"Highlight word updated\", \"highlight\": new_highlight}), 200\n self.logger.error(\"No highlight word provided\")\n return jsonify({\"error\": \"No highlight word provided\"}), 400\n\n # Setup routes to proxy all requests and WebSocket events\n def setup_routes(self):\n self.app.add_url_rule('/', defaults={'path': ''}, view_func=self.proxy, methods=['GET', 'POST', 'PUT', 'DELETE', 'OPTIONS'])\n self.app.add_url_rule('/<path:path>', view_func=self.proxy, methods=['GET', 'POST', 'PUT', 'DELETE', 'OPTIONS'])\n self.app.add_url_rule('/set_highlight', view_func=self.set_highlight, methods=['POST'])\n\n def run(self):\n \"\"\"Runs the Werkzeug server\"\"\"\n logging.info(f\"Starting server on {self.host}:{self.port}\")\n self.server = make_server(self.host, self.port, self.app, threaded=True)\n self.is_running = True\n self.server.serve_forever()\n\n def shutdown_server(self):\n \"\"\"Shuts down the Werkzeug server\"\"\"\n if self.server:\n self.logger.info(\"Shutting down server...\")\n self.is_running = False\n self.server.shutdown()\n self.logger.info(\"Server shut down complete\")\n\n# Create an instance of ProxyServer and expose the app\n# proxy_server = ProxyServer(secret_key='your_secret_key_here')\n# app = proxy_server.app # Expose the Flask app to the top-level scope for Flask CLI\n\n# if __name__ == '__main__':\n# proxy_server.run(port=5000)\n",
+ "rag_system.py": "import openai\nimport faiss\nimport numpy as np\nimport sys\nfrom sklearn.metrics.pairwise import cosine_similarity\nfrom chat_demo import ChatDemo\nfrom web_crawler import WebCrawler \n\nclass RAGSystem:\n def __init__(self, model_name=\"text-embedding-ada-002\"):\n self.client = openai.OpenAI()\n self.model_name = model_name\n self.index = None\n self.faiss_data = []\n\n def split_into_chunks(self, page_data, max_chunk_size=500):\n chunks = []\n for page in page_data:\n url = page['url']\n for paragraph in page['paragraphs']:\n if len(paragraph) <= max_chunk_size:\n chunks.append({'content': paragraph, 'url': url})\n else:\n # Break long paragraphs into smaller chunks\n for i in range(0, len(paragraph), max_chunk_size):\n chunks.append({'content': paragraph[i:i+max_chunk_size], 'url': url})\n return chunks\n\n def compute_embeddings(self, text_chunks):\n texts = [chunk['content'] for chunk in text_chunks]\n response = self.client.embeddings.create(model=self.model_name, input=texts)\n embeddings = [result.embedding for result in response.data]\n return embeddings\n\n def store_embeddings_in_faiss(self, embeddings, text_chunks):\n # Initialize a FAISS index\n dimension = len(embeddings[0])\n self.index = faiss.IndexFlatL2(dimension)\n \n for idx, embedding in enumerate(embeddings):\n np_embedding = np.array(embedding, dtype='float32')\n self.index.add(np_embedding.reshape(1, -1))\n self.faiss_data.append({\n 'embedding': np_embedding,\n 'content': text_chunks[idx]['content'],\n 'url': text_chunks[idx]['url']\n })\n\n def process_content(self, website_data):\n # Split data into chunks\n text_chunks = self.split_into_chunks(website_data)\n \n # Compute embeddings and create vector database\n embeddings = self.compute_embeddings(text_chunks)\n self.store_embeddings_in_faiss(embeddings, text_chunks)\n\n def process_user_query(self, query):\n # Compute the embedding of the query\n response = self.client.embeddings.create(model=self.model_name, input=[query])\n query_embedding = response.data[0].embedding\n np_query_embedding = np.array(query_embedding, dtype='float32').reshape(1, -1)\n\n # Find the most similar embeddings\n distances, indices = self.index.search(np_query_embedding, 5) # Retrieve top 5 similar results\n similar_chunks = [self.faiss_data[i] for i in indices[0]]\n\n return similar_chunks\n\n# Main function\nif __name__ == \"__main__\":\n if len(sys.argv) != 2:\n print(\"Usage: python rag_manager.py <url>\")\n sys.exit(1)\n\n url = sys.argv[1]\n # Crawl the website\n crawler = WebCrawler()\n website_data = crawler.crawl(url, 2)\n print(\"website data\", website_data)\n\n # Initialize RAGManager\n rag_system = RAGSystem()\n\n # Process content from page data\n rag_system.process_content(website_data)\n\n # Initialize ChatDemo with RAGManager\n chat_demo = ChatDemo(rag_system)\n\n # Command-line interaction loop for user queries\n while True:\n user_query = input(\"You: \")\n if user_query.lower() in [\"exit\", \"quit\"]:\n print(\"Goodbye!\")\n break\n response = chat_demo.chatbot(user_query)\n print(f\"Assistant: {response}\")",
+ "web_crawler.py": "import requests\nfrom bs4 import BeautifulSoup\nfrom urllib.parse import urljoin, urlparse\nimport json\nimport re\nimport time\nimport sys\n\nclass WebCrawler:\n def __init__(self, max_depth=2):\n self.base_url = None\n self.visited = set()\n self.max_depth = max_depth\n self.data = []\n self.session = requests.Session()\n self.delay = 0.1 # Delay between requests to prevent overwhelming the server\n\n def can_crawl(self, url):\n \"\"\"Check robots.txt to see if crawling is allowed.\"\"\"\n parsed_url = urlparse(url)\n robots_url = urljoin(f\"{parsed_url.scheme}://{parsed_url.netloc}\", '/robots.txt')\n try:\n response = self.session.get(robots_url, timeout=10)\n if response.status_code == 200:\n disallowed_paths = re.findall(r'Disallow: (.+)', response.text)\n for path in disallowed_paths:\n if url.startswith(urljoin(self.base_url, path.strip())):\n return False\n except requests.RequestException:\n pass\n return True\n\n def fetch(self, url):\n \"\"\"Fetch the content of a URL.\"\"\"\n try:\n response = self.session.get(url, timeout=10)\n response.raise_for_status()\n return response.text\n except requests.RequestException as e:\n print(f\"Error fetching {url}: {e}\")\n return None\n\n def parse(self, html_content, url):\n \"\"\"Parse the HTML content and extract headings, paragraphs, and links.\"\"\"\n soup = BeautifulSoup(html_content, 'html.parser')\n page_data = {\n 'url': url,\n 'headings': [heading.get_text(strip=True) for heading in soup.find_all(re.compile('^h[1-6]$'))],\n 'paragraphs': [p.get_text(strip=True) for p in soup.find_all('p')],\n }\n self.data.append(page_data)\n return [urljoin(url, a['href']) for a in soup.find_all('a', href=True)]\n\n def crawl(self, url, depth):\n \"\"\"Crawl a given URL up to a certain depth.\"\"\"\n if depth > self.max_depth or url in self.visited or not self.can_crawl(url):\n return\n\n print(f\"Crawling: {url} at depth {depth}\")\n self.base_url = url\n self.visited.add(url)\n html_content = self.fetch(url)\n if html_content:\n links = self.parse(html_content, url)\n for link in links:\n if link.startswith(self.base_url): # Stay within the same domain\n time.sleep(self.delay) # Respectful crawling\n self.crawl(link, depth + 1)\n return self.get_data()\n\n def get_data(self):\n \"\"\"Return the crawled data.\"\"\"\n return self.data\n\nif __name__ == \"__main__\":\n if len(sys.argv) != 2:\n print(\"Usage: python web_crawler.py <URL>\")\n sys.exit(1)\n\n base_url = sys.argv[1]\n crawler = WebCrawler(max_depth=2)\n data = crawler.crawl(base_url, 0)\n print(json.dumps(data, indent=4))\n",
+ ".gitignore": "__pycache__\n",
+ "chatbot_ui.py": "import gradio as gr\nfrom web_crawler import WebCrawler\nfrom rag_system import RAGSystem\nfrom chat_demo import ChatDemo\nimport validators\n\n# Instantiate your existing classes\nrag_system = RAGSystem()\nchat_demo = ChatDemo(rag_system)\n\n# Function to handle the initial URL submission\ndef process_url(url):\n if not validators.url(url):\n return \"Invalid URL. Please enter a valid URL.\", None\n \n try:\n # Display loading message\n status_message = \"Crawling website and processing data...\"\n \n # Instantiate WebCrawler with the provided URL\n web_crawler = WebCrawler()\n \n # Crawl the website\n extracted_content = web_crawler.crawl(url, 0)\n \n # Process the data with the RAG system\n rag_system.process_content(extracted_content)\n\n # Display the website content in an iframe\n iframe_html = f'<iframe src=\"{url}\" width=\"100%\" height=\"500px\"></iframe>'\n \n return \"Website content successfully crawled and processed!\", [], iframe_html\n except Exception as e:\n return f\"Error: {str(e)}\", []\n\n# Function to handle chatbot interactions\ndef chatbot_response(user_input, chat_history):\n try:\n # Use the ChatDemo class to generate a response\n response = chat_demo.chatbot(user_input)\n chat_history.append([\"User\", user_input])\n chat_history.append([\"Chatbot\", response])\n \n # Update the chat history and return\n return chat_history, chat_history\n except Exception as e:\n return [[\"Error\", str(e)]], chat_history\n\n# Function to reset the application\ndef reset_app():\n return \"\", [], \"\", \"\"\n\n# Build the Gradio interface\nwith gr.Blocks() as demo:\n url_input = gr.Textbox(placeholder=\"Enter a website URL\", label=\"Website URL\", interactive=True)\n submit_button = gr.Button(\"Submit URL\")\n status_message = gr.Textbox(label=\"Status\", interactive=False)\n chat_history = gr.State(value=[])\n chatbot_output = gr.Chatbot(label=\"Chat History\")\n user_input = gr.Textbox(placeholder=\"Ask the chatbot...\", label=\"User Input\", interactive=True)\n embedded_view = gr.HTML(label=\"Website View\")\n reset_button = gr.Button(\"Change Website\")\n\n # Initial URL submission\n submit_button.click(process_url, inputs=url_input, outputs=[status_message, chat_history, embedded_view], queue=True)\n\n # Handle chatbot responses\n user_input.submit(chatbot_response, inputs=[user_input, chat_history], outputs=[chatbot_output, chat_history])\n\n # Handle reset button click\n reset_button.click(reset_app, outputs=[url_input, chat_history, status_message, embedded_view])\n\n# Launch the app\ndemo.launch()",
+ ".flaskenv": "FLASK_APP=proxy_server.py\nFLASK_ENV=development\nFLASK_DEBUG=1\n",
+ "proxy_thread.py": "import threading\nimport requests\nimport logging\nimport time\nfrom proxy_server import ProxyServer\n\nlogging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')\nlogger = logging.getLogger(__name__)\n\nclass ProxyThread(threading.Thread):\n def __init__(self, host='localhost', port=5000):\n threading.Thread.__init__(self)\n secret_key='your_secret_key_here'\n self.proxy_server = ProxyServer(secret_key, host, port)\n\n def run(self):\n \"\"\"Starts the proxy server in a thread\"\"\"\n logger.info(\"ProxyThread starting\")\n self.proxy_server.run()\n\n def stop(self):\n \"\"\"Stops the proxy server by sending a shutdown request\"\"\"\n logger.info(\"Stopping ProxyThread\")\n try:\n response = requests.post(f'http://{self.proxy_server.host}:{self.proxy_server.port}/shutdown', timeout=5)\n logger.info(f\"Shutdown request sent. Response: {response.text}\")\n except requests.RequestException as e:\n logger.error(f\"Error sending shutdown request: {e}\")\n\n # Wait for the server to shut down\n timeout = 10\n start_time = time.time()\n while self.proxy_server.is_running and time.time() - start_time < timeout:\n time.sleep(0.1)\n\n if self.proxy_server.is_running:\n logger.warning(\"Server did not shut down in time\")\n else:\n logger.info(\"Server has been shut down\")\n\n self.join(timeout=5)\n logger.info(\"ProxyThread stopped\")\n\n"
+ }
proxy_server.py ADDED
@@ -0,0 +1,146 @@
+ import time
+ from flask import Flask, request, Response, session, jsonify
+ from flask_socketio import SocketIO, emit
+ from werkzeug.serving import make_server
+ import requests
+ import re
+ import logging
+
+ # Configure logging
+ logger = logging.getLogger(__name__)
+
+ class ProxyServer:
+     def __init__(self, secret_key, host='localhost', port=5000):
+         self.host = host
+         self.port = port
+         self.app = Flask(__name__)
+         self.app.secret_key = secret_key
+         self.server = None
+         self.is_running = False
+         self.socketio = SocketIO(self.app)  # Initialize SocketIO with the Flask app
+         # self.server_thread = None  # Thread for running the server
+         # self.server_running = False  # Flag to track server state
+         self.setup_routes()
+         self.highlight_word = None  # Initialize the highlight word
+
+         @self.app.route('/shutdown', methods=['POST'])
+         def shutdown():
+             logger.info("Shutdown request received")
+             self.shutdown_server()
+             return 'Server shutting down...'
+
+         logger.info("Proxy server initialized")
+
+     # Inject JavaScript into HTML content to highlight words and listen for WebSocket updates
+     def inject_script(self, content):
+         # Inject the WebSocket listening script
+         script = f"""
+         <script src="https://cdnjs.cloudflare.com/ajax/libs/socket.io/4.0.1/socket.io.js"></script>
+         <script>
+             let currentHighlight = "{self.highlight_word}";
+             function highlightWord(word) {{
+                 if (word) {{
+                     document.body.innerHTML = document.body.innerHTML.replace(
+                         new RegExp(word, 'g'),
+                         '<span style="background-color: yellow;">' + word + '</span>'
+                     );
+                 }}
+             }}
+             highlightWord(currentHighlight);
+
+             // Connect to WebSocket
+             const socket = io();
+             socket.on('new_highlight', function(data) {{
+                 currentHighlight = data.highlight;
+                 highlightWord(currentHighlight);
+             }});
+         </script>
+         """
+         return re.sub(r'</body>', script + '</body>', content)
+
+     # Ensure the target_url and path are handled correctly
+     def build_full_url(self, target_url, path):
+         if not target_url.endswith('/') and not path.startswith('/'):
+             return f"{target_url}/{path}"
+         return f"{target_url}{path}"
+
+     # Route handler for proxying requests
+     def proxy(self, path=''):
+         target_url = request.args.get('target_url')
+         if not target_url and 'target_url' in session:
+             target_url = session['target_url']
+         elif target_url:
+             session['target_url'] = target_url
+
+         if not target_url:
+             logger.error("No target_url provided")
+             return "Error: target_url query parameter is required", 400
+
+         full_target_url = self.build_full_url(target_url, path)
+         logger.info(f"Proxying request to {full_target_url}")
+
+         headers = {key: value for key, value in request.headers if key != 'Host'}
+
+         # Handle POST or GET requests
+         if request.method == 'POST':
+             response = requests.post(full_target_url, headers=headers, data=request.get_data(), stream=True)
+         else:
+             response = requests.get(full_target_url, headers=headers, stream=True)
+
+         # If it's HTML content, inject the script
+         if 'text/html' in response.headers.get('Content-Type', ''):
+             def generate():
+                 for chunk in response.iter_content(chunk_size=1024):
+                     if chunk:
+                         rewritten_chunk = self.inject_script(chunk.decode('utf-8'))
+                         yield rewritten_chunk.encode('utf-8')
+             logger.info(f"Injecting script into HTML response from {full_target_url}")
+             return Response(generate(), content_type=response.headers['Content-Type'])
+
+         # Stream non-HTML content (images, scripts, etc.)
+         else:
+             def generate():
+                 for chunk in response.iter_content(chunk_size=1024):
+                     if chunk:
+                         yield chunk
+             return Response(generate(), content_type=response.headers['Content-Type'])
+
+     # API endpoint to set a new highlight word
+     def set_highlight(self):
+         new_highlight = request.json.get('highlight')
+         if new_highlight:
+             self.highlight_word = new_highlight
+             # Emit the new highlight word to all connected clients
+             self.socketio.emit('new_highlight', {'highlight': new_highlight})
+             logger.info(f"Highlight word updated to '{new_highlight}' and broadcasted to clients")
+             return jsonify({"message": "Highlight word updated", "highlight": new_highlight}), 200
+         logger.error("No highlight word provided")
+         return jsonify({"error": "No highlight word provided"}), 400
+
+     # Setup routes to proxy all requests and WebSocket events
+     def setup_routes(self):
+         self.app.add_url_rule('/', defaults={'path': ''}, view_func=self.proxy, methods=['GET', 'POST', 'PUT', 'DELETE', 'OPTIONS'])
+         self.app.add_url_rule('/<path:path>', view_func=self.proxy, methods=['GET', 'POST', 'PUT', 'DELETE', 'OPTIONS'])
+         self.app.add_url_rule('/set_highlight', view_func=self.set_highlight, methods=['POST'])
+
+     def run(self):
+         """Runs the Werkzeug server"""
+         logging.info(f"Starting server on {self.host}:{self.port}")
+         self.server = make_server(self.host, self.port, self.app, threaded=True)
+         self.is_running = True
+         self.server.serve_forever()
+
+     def shutdown_server(self):
+         """Shuts down the Werkzeug server"""
+         if self.server:
+             logger.info("Shutting down server...")
+             self.is_running = False
+             self.server.shutdown()
+             logger.info("Server shut down complete")
+
+ # Create an instance of ProxyServer and expose the app
+ # proxy_server = ProxyServer(secret_key='your_secret_key_here')
+ # app = proxy_server.app  # Expose the Flask app to the top-level scope for Flask CLI
+
+ # if __name__ == '__main__':
+ #     proxy_server.run(port=5000)
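The proxy's two entry points are the catch-all proxy route, which expects a target_url query parameter on the first request, and /set_highlight. A small sketch (not part of the commit) of driving both from a separate client, assuming the server is already running on localhost:5000 as configured above; the target site is a placeholder:

# Sketch only: exercising the running proxy from a client process.
import requests

# First request carries target_url; HTML responses come back with the
# highlight/WebSocket script injected by inject_script().
page = requests.get("http://127.0.0.1:5000/", params={"target_url": "https://example.com"})
print(page.status_code, page.headers.get("Content-Type"))

# Push a new highlight word; set_highlight() broadcasts it over Socket.IO
# to any page already loaded through the proxy.
resp = requests.post("http://127.0.0.1:5000/set_highlight", json={"highlight": "example"})
print(resp.json())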
proxy_thread.py ADDED
@@ -0,0 +1,43 @@
+ import threading
+ import requests
+ import logging
+ import time
+ from proxy_server import ProxyServer
+
+ # Configure logging
+ logger = logging.getLogger(__name__)
+
+ class ProxyThread(threading.Thread):
+     def __init__(self, host='localhost', port=5000):
+         threading.Thread.__init__(self)
+         secret_key = 'your_secret_key_here'
+         self.proxy_server = ProxyServer(secret_key, host, port)
+
+     def run(self):
+         """Starts the proxy server in a thread"""
+         logger.info("ProxyThread starting")
+         self.proxy_server.run()
+
+     def stop(self):
+         """Stops the proxy server by sending a shutdown request"""
+         logger.info("Stopping ProxyThread")
+         try:
+             response = requests.post(f'http://{self.proxy_server.host}:{self.proxy_server.port}/shutdown', timeout=5)
+             logger.info(f"Shutdown request sent. Response: {response.text}")
+         except requests.RequestException as e:
+             logger.error(f"Error sending shutdown request: {e}")
+
+         # Wait for the server to shut down
+         timeout = 10
+         start_time = time.time()
+         while self.proxy_server.is_running and time.time() - start_time < timeout:
+             time.sleep(0.1)
+
+         if self.proxy_server.is_running:
+             logger.warning("Server did not shut down in time")
+         else:
+             logger.info("Server has been shut down")
+
+         self.join(timeout=5)
+         logger.info("ProxyThread stopped")
+
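chatbot_ui.py drives this class through start_proxy_server(); a stripped-down sketch of the same lifecycle (not part of the commit), assuming port 5000 is free:

# Sketch only: start the proxy in a background thread, then shut it down
# via the /shutdown route, as chatbot_ui.start_proxy_server() does.
from proxy_thread import ProxyThread

proxy_thread = ProxyThread(host='localhost', port=5000)
proxy_thread.start()      # run() starts the Werkzeug server on localhost:5000

# ... load pages through http://127.0.0.1:5000/?target_url=... ...

if proxy_thread.is_alive():
    proxy_thread.stop()   # POSTs /shutdown, waits for is_running to clear, then joins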
rag_system.py ADDED
@@ -0,0 +1,72 @@
+ import openai
+ import faiss
+ import numpy as np
+ import logging
+ from sklearn.metrics.pairwise import cosine_similarity
+
+ # Configure logging
+ logger = logging.getLogger(__name__)
+
+ class RAGSystem:
+     def __init__(self, model_name="text-embedding-ada-002"):
+         self.client = openai.OpenAI()
+         self.model_name = model_name
+         self.index = None
+         self.faiss_data = []
+         logger.info(f"RAGSystem initialized with model: {model_name}")
+
+     def split_into_chunks(self, page_data, max_chunk_size=500):
+         logger.info(f"Splitting data into chunks with max size: {max_chunk_size}")
+         chunks = []
+         for page in page_data:
+             url = page['url']
+             for paragraph in page['paragraphs']:
+                 if len(paragraph) <= max_chunk_size:
+                     chunks.append({'content': paragraph, 'url': url})
+                 else:
+                     # Break long paragraphs into smaller chunks
+                     for i in range(0, len(paragraph), max_chunk_size):
+                         chunks.append({'content': paragraph[i:i+max_chunk_size], 'url': url})
+         logger.debug(f"Created {len(chunks)} chunks")
+         return chunks
+
+     def compute_embeddings(self, text_chunks):
+         logger.info(f"Computing embeddings for {len(text_chunks)} chunks")
+         texts = [chunk['content'] for chunk in text_chunks]
+         response = self.client.embeddings.create(model=self.model_name, input=texts)
+         embeddings = [result.embedding for result in response.data]
+         logger.debug(f"Computed {len(embeddings)} embeddings")
+         return embeddings
+
+     def store_embeddings_in_faiss(self, embeddings, text_chunks):
+         logger.info("Storing embeddings in FAISS index")
+         dimension = len(embeddings[0])
+         self.index = faiss.IndexFlatL2(dimension)
+
+         for idx, embedding in enumerate(embeddings):
+             np_embedding = np.array(embedding, dtype='float32')
+             self.index.add(np_embedding.reshape(1, -1))
+             self.faiss_data.append({
+                 'embedding': np_embedding,
+                 'content': text_chunks[idx]['content'],
+                 'url': text_chunks[idx]['url']
+             })
+         logger.debug(f"Stored {len(embeddings)} embeddings in FAISS index")
+
+     def process_content(self, website_data):
+         logger.info("Processing website content")
+         text_chunks = self.split_into_chunks(website_data)
+         embeddings = self.compute_embeddings(text_chunks)
+         self.store_embeddings_in_faiss(embeddings, text_chunks)
+         logger.info("Content processing completed")
+
+     def process_user_query(self, query):
+         logger.info(f"Processing user query: {query}")
+         response = self.client.embeddings.create(model=self.model_name, input=[query])
+         query_embedding = response.data[0].embedding
+         np_query_embedding = np.array(query_embedding, dtype='float32').reshape(1, -1)
+
+         distances, indices = self.index.search(np_query_embedding, 5)  # Retrieve top 5 similar results
+         similar_chunks = [self.faiss_data[i] for i in indices[0]]
+         logger.debug(f"Retrieved {len(similar_chunks)} similar chunks for the query")
+         return similar_chunks
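Compared with the copy embedded in project_snapshot_no_images.json, this version drops the command-line __main__ block and the ChatDemo/WebCrawler imports, leaving a pure library module. A short sketch (not part of the commit) of the retrieval path on its own; the page data and query are hypothetical:

# Sketch only: standalone retrieval after process_content() has built the index.
rag = RAGSystem()
rag.process_content([
    {"url": "https://example.com/about", "paragraphs": ["We sell handmade lamps."]},
    {"url": "https://example.com/faq", "paragraphs": ["Shipping takes 3-5 days."]},
])

for chunk in rag.process_user_query("How long does shipping take?"):
    # Each result keeps the source URL alongside the matched text.
    print(chunk["url"], "->", chunk["content"][:60])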
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ beautifulsoup4==4.12.3
+ faiss_cpu==1.8.0.post1
+ Flask==3.0.3
+ Flask_SocketIO==5.4.1
+ gradio==4.44.1
+ numpy==2.1.2
+ openai==1.51.1
+ Requests==2.32.3
+ scikit_learn==1.5.2
+ validators==0.28.1
+ Werkzeug==3.0.4
web_crawler.py ADDED
@@ -0,0 +1,92 @@
+ import requests
+ from bs4 import BeautifulSoup
+ from urllib.parse import urljoin, urlparse
+ import json
+ import re
+ import time
+ import sys
+ import logging
+
+ # Configure logging
+ logger = logging.getLogger(__name__)
+
+ class WebCrawler:
+     def __init__(self, max_depth=2):
+         self.base_url = None
+         self.visited = set()
+         self.max_depth = max_depth
+         self.data = []
+         self.session = requests.Session()
+         self.delay = 0.1  # Delay between requests to prevent overwhelming the server
+         logger.info(f"WebCrawler initialized with max_depth: {max_depth}")
+
+     def can_crawl(self, url):
+         logger.debug(f"Checking if can crawl: {url}")
+         parsed_url = urlparse(url)
+         robots_url = urljoin(f"{parsed_url.scheme}://{parsed_url.netloc}", '/robots.txt')
+         try:
+             response = self.session.get(robots_url, timeout=10)
+             if response.status_code == 200:
+                 disallowed_paths = re.findall(r'Disallow: (.+)', response.text)
+                 for path in disallowed_paths:
+                     if url.startswith(urljoin(self.base_url, path.strip())):
+                         logger.info(f"Crawling not allowed for: {url}")
+                         return False
+         except requests.RequestException:
+             logger.warning(f"Error fetching robots.txt for {url}", exc_info=True)
+         logger.debug(f"Crawling allowed for: {url}")
+         return True
+
+     def fetch(self, url):
+         logger.info(f"Fetching content from: {url}")
+         try:
+             response = self.session.get(url, timeout=10)
+             response.raise_for_status()
+             logger.debug(f"Successfully fetched content from: {url}")
+             return response.text
+         except requests.RequestException as e:
+             logger.error(f"Error fetching {url}: {e}", exc_info=True)
+             return None
+
+     def parse(self, html_content, url):
+         logger.info(f"Parsing HTML content from: {url}")
+         soup = BeautifulSoup(html_content, 'html.parser')
+         page_data = {
+             'url': url,
+             'headings': [heading.get_text(strip=True) for heading in soup.find_all(re.compile('^h[1-6]$'))],
+             'paragraphs': [p.get_text(strip=True) for p in soup.find_all('p')],
+         }
+         self.data.append(page_data)
+         links = [urljoin(url, a['href']) for a in soup.find_all('a', href=True)]
+         logger.debug(f"Parsed {len(links)} links from {url}")
+         return links
+
+     def crawl(self, url, depth):
+         if depth > self.max_depth or url in self.visited or not self.can_crawl(url):
+             return
+
+         logger.info(f"Crawling: {url} at depth {depth}")
+         self.base_url = url
+         self.visited.add(url)
+         html_content = self.fetch(url)
+         if html_content:
+             links = self.parse(html_content, url)
+             for link in links:
+                 if link.startswith(self.base_url):  # Stay within the same domain
+                     time.sleep(self.delay)  # Respectful crawling
+                     self.crawl(link, depth + 1)
+         return self.get_data()
+
+     def get_data(self):
+         logger.info(f"Returning crawled data: {len(self.data)} pages")
+         return self.data
+
+ if __name__ == "__main__":
+     if len(sys.argv) != 2:
+         print("Usage: python web_crawler.py <URL>")
+         sys.exit(1)
+
+     base_url = sys.argv[1]
+     crawler = WebCrawler(max_depth=2)
+     data = crawler.crawl(base_url, 0)
+     print(json.dumps(data, indent=4))
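The crawler returns a list of per-page dicts (url, headings, paragraphs), which is the shape rag_system.split_into_chunks expects. A brief sketch (not part of the commit) of using it as a library rather than via the __main__ block; the start URL is a placeholder and the site is assumed to allow crawling:

# Sketch only: crawl starting at depth 0 (recurses up to max_depth),
# as process_url() in chatbot_ui.py does.
from web_crawler import WebCrawler

crawler = WebCrawler(max_depth=2)
pages = crawler.crawl("https://example.com", 0)   # returns crawler.data, or None if disallowed

for page in pages or []:
    # Each entry carries the page URL plus its extracted headings and paragraphs.
    print(page["url"], len(page["headings"]), len(page["paragraphs"]))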