| ".env": "OPENAI_API_KEY=sk-proj-8BdwEtUg42Q651Jy0GPFT3BlbkFJLmIfOiZ3cXLaD5NyHqAF\n", | |
| "proxy_server.py": "import time\nfrom flask import Flask, request, Response, session, jsonify\nfrom flask_socketio import SocketIO, emit\nfrom werkzeug.serving import make_server\nimport requests\nimport re\nimport logging\nimport threading\n\nclass ProxyServer:\n def __init__(self, secret_key, host='localhost', port=5000):\n self.host = host\n self.port = port\n self.app = Flask(__name__)\n self.app.secret_key = secret_key\n self.server = None\n self.is_running = False\n self.socketio = SocketIO(self.app) # Initialize SocketIO with the Flask app\n # self.server_thread = None # Thread for running the server\n # self.server_running = False # Flag to track server state\n self.setup_routes()\n self.highlight_word = None # Initialize the highlight word\n \n @self.app.route('/shutdown', methods=['POST'])\n def shutdown():\n self.logger.info(\"Shutdown request received\")\n self.shutdown_server()\n return 'Server shutting down...'\n \n # Setup logging\n self.logger = logging.getLogger('ProxyServer')\n self.logger.setLevel(logging.DEBUG) # Set the logging level (DEBUG, INFO, WARNING, ERROR)\n handler = logging.StreamHandler() # Log to standard output (can be customized to log to a file)\n handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))\n self.logger.addHandler(handler)\n\n self.logger.info(\"Proxy server initialized\")\n\n # Inject JavaScript into HTML content to highlight words and listen for WebSocket updates\n def inject_script(self, content):\n # Inject the WebSocket listening script\n script = f\"\"\"\n <script src=\"https://cdnjs.cloudflare.com/ajax/libs/socket.io/4.0.1/socket.io.js\"></script>\n <script>\n let currentHighlight = \"{self.highlight_word}\";\n function highlightWord(word) {{\n if (word) {{\n document.body.innerHTML = document.body.innerHTML.replace(\n new RegExp(word, 'g'),\n '<span style=\"background-color: yellow;\">' + word + '</span>'\n );\n }}\n }}\n highlightWord(currentHighlight);\n\n // Connect to WebSocket\n const socket = io();\n socket.on('new_highlight', function(data) {{\n currentHighlight = data.highlight;\n highlightWord(currentHighlight);\n }});\n </script>\n \"\"\"\n return re.sub(r'</body>', script + '</body>', content)\n\n # Ensure the target_url and path are handled correctly\n def build_full_url(self, target_url, path):\n if not target_url.endswith('/') and not path.startswith('/'):\n return f\"{target_url}/{path}\"\n return f\"{target_url}{path}\"\n\n # Route handler for proxying requests\n def proxy(self, path=''):\n target_url = request.args.get('target_url')\n if not target_url and 'target_url' in session:\n target_url = session['target_url']\n elif target_url:\n session['target_url'] = target_url\n\n if not target_url:\n self.logger.error(\"No target_url provided\")\n return \"Error: target_url query parameter is required\", 400\n\n full_target_url = self.build_full_url(target_url, path)\n self.logger.info(f\"Proxying request to {full_target_url}\")\n\n headers = {key: value for key, value in request.headers if key != 'Host'}\n\n # Handle POST or GET requests\n if request.method == 'POST':\n response = requests.post(full_target_url, headers=headers, data=request.get_data(), stream=True)\n else:\n response = requests.get(full_target_url, headers=headers, stream=True)\n\n # If it's HTML content, inject the script\n if 'text/html' in response.headers.get('Content-Type', ''):\n def generate():\n for chunk in response.iter_content(chunk_size=1024):\n if chunk:\n rewritten_chunk = 
self.inject_script(chunk.decode('utf-8'))\n yield rewritten_chunk.encode('utf-8')\n self.logger.info(f\"Injecting script into HTML response from {full_target_url}\")\n return Response(generate(), content_type=response.headers['Content-Type'])\n\n # Stream non-HTML content (images, scripts, etc.)\n else:\n def generate():\n for chunk in response.iter_content(chunk_size=1024):\n if chunk:\n yield chunk\n return Response(generate(), content_type=response.headers['Content-Type'])\n\n # API endpoint to set a new highlight word\n def set_highlight(self):\n new_highlight = request.json.get('highlight')\n if new_highlight:\n self.highlight_word = new_highlight\n # Emit the new highlight word to all connected clients\n self.socketio.emit('new_highlight', {'highlight': new_highlight})\n self.logger.info(f\"Highlight word updated to '{new_highlight}' and broadcasted to clients\")\n return jsonify({\"message\": \"Highlight word updated\", \"highlight\": new_highlight}), 200\n self.logger.error(\"No highlight word provided\")\n return jsonify({\"error\": \"No highlight word provided\"}), 400\n\n # Setup routes to proxy all requests and WebSocket events\n def setup_routes(self):\n self.app.add_url_rule('/', defaults={'path': ''}, view_func=self.proxy, methods=['GET', 'POST', 'PUT', 'DELETE', 'OPTIONS'])\n self.app.add_url_rule('/<path:path>', view_func=self.proxy, methods=['GET', 'POST', 'PUT', 'DELETE', 'OPTIONS'])\n self.app.add_url_rule('/set_highlight', view_func=self.set_highlight, methods=['POST'])\n\n def run(self):\n \"\"\"Runs the Werkzeug server\"\"\"\n logging.info(f\"Starting server on {self.host}:{self.port}\")\n self.server = make_server(self.host, self.port, self.app, threaded=True)\n self.is_running = True\n self.server.serve_forever()\n\n def shutdown_server(self):\n \"\"\"Shuts down the Werkzeug server\"\"\"\n if self.server:\n self.logger.info(\"Shutting down server...\")\n self.is_running = False\n self.server.shutdown()\n self.logger.info(\"Server shut down complete\")\n\n# Create an instance of ProxyServer and expose the app\n# proxy_server = ProxyServer(secret_key='your_secret_key_here')\n# app = proxy_server.app # Expose the Flask app to the top-level scope for Flask CLI\n\n# if __name__ == '__main__':\n# proxy_server.run(port=5000)\n", | |
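Once the server is running, the highlight feature can be exercised from any HTTP client. A minimal sketch, assuming the server was started locally on port 5000 (the target site and the word `example` are arbitrary placeholders):

```python
import requests

# Open a page through the proxy; the proxy remembers target_url in its session
resp = requests.get("http://localhost:5000/", params={"target_url": "https://example.com"})
print(resp.status_code)

# Update the highlight word; the server broadcasts it to connected pages
resp = requests.post("http://localhost:5000/set_highlight", json={"highlight": "example"})
print(resp.json())  # expected: {"message": "Highlight word updated", "highlight": "example"}
```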
| "rag_system.py": "import openai\nimport faiss\nimport numpy as np\nimport sys\nfrom sklearn.metrics.pairwise import cosine_similarity\nfrom chat_demo import ChatDemo\nfrom web_crawler import WebCrawler \n\nclass RAGSystem:\n def __init__(self, model_name=\"text-embedding-ada-002\"):\n self.client = openai.OpenAI()\n self.model_name = model_name\n self.index = None\n self.faiss_data = []\n\n def split_into_chunks(self, page_data, max_chunk_size=500):\n chunks = []\n for page in page_data:\n url = page['url']\n for paragraph in page['paragraphs']:\n if len(paragraph) <= max_chunk_size:\n chunks.append({'content': paragraph, 'url': url})\n else:\n # Break long paragraphs into smaller chunks\n for i in range(0, len(paragraph), max_chunk_size):\n chunks.append({'content': paragraph[i:i+max_chunk_size], 'url': url})\n return chunks\n\n def compute_embeddings(self, text_chunks):\n texts = [chunk['content'] for chunk in text_chunks]\n response = self.client.embeddings.create(model=self.model_name, input=texts)\n embeddings = [result.embedding for result in response.data]\n return embeddings\n\n def store_embeddings_in_faiss(self, embeddings, text_chunks):\n # Initialize a FAISS index\n dimension = len(embeddings[0])\n self.index = faiss.IndexFlatL2(dimension)\n \n for idx, embedding in enumerate(embeddings):\n np_embedding = np.array(embedding, dtype='float32')\n self.index.add(np_embedding.reshape(1, -1))\n self.faiss_data.append({\n 'embedding': np_embedding,\n 'content': text_chunks[idx]['content'],\n 'url': text_chunks[idx]['url']\n })\n\n def process_content(self, website_data):\n # Split data into chunks\n text_chunks = self.split_into_chunks(website_data)\n \n # Compute embeddings and create vector database\n embeddings = self.compute_embeddings(text_chunks)\n self.store_embeddings_in_faiss(embeddings, text_chunks)\n\n def process_user_query(self, query):\n # Compute the embedding of the query\n response = self.client.embeddings.create(model=self.model_name, input=[query])\n query_embedding = response.data[0].embedding\n np_query_embedding = np.array(query_embedding, dtype='float32').reshape(1, -1)\n\n # Find the most similar embeddings\n distances, indices = self.index.search(np_query_embedding, 5) # Retrieve top 5 similar results\n similar_chunks = [self.faiss_data[i] for i in indices[0]]\n\n return similar_chunks\n\n# Main function\nif __name__ == \"__main__\":\n if len(sys.argv) != 2:\n print(\"Usage: python rag_manager.py <url>\")\n sys.exit(1)\n\n url = sys.argv[1]\n # Crawl the website\n crawler = WebCrawler()\n website_data = crawler.crawl(url, 2)\n print(\"website data\", website_data)\n\n # Initialize RAGManager\n rag_system = RAGSystem()\n\n # Process content from page data\n rag_system.process_content(website_data)\n\n # Initialize ChatDemo with RAGManager\n chat_demo = ChatDemo(rag_system)\n\n # Command-line interaction loop for user queries\n while True:\n user_query = input(\"You: \")\n if user_query.lower() in [\"exit\", \"quit\"]:\n print(\"Goodbye!\")\n break\n response = chat_demo.chatbot(user_query)\n print(f\"Assistant: {response}\")", | |
| "web_crawler.py": "import requests\nfrom bs4 import BeautifulSoup\nfrom urllib.parse import urljoin, urlparse\nimport json\nimport re\nimport time\nimport sys\n\nclass WebCrawler:\n def __init__(self, max_depth=2):\n self.base_url = None\n self.visited = set()\n self.max_depth = max_depth\n self.data = []\n self.session = requests.Session()\n self.delay = 0.1 # Delay between requests to prevent overwhelming the server\n\n def can_crawl(self, url):\n \"\"\"Check robots.txt to see if crawling is allowed.\"\"\"\n parsed_url = urlparse(url)\n robots_url = urljoin(f\"{parsed_url.scheme}://{parsed_url.netloc}\", '/robots.txt')\n try:\n response = self.session.get(robots_url, timeout=10)\n if response.status_code == 200:\n disallowed_paths = re.findall(r'Disallow: (.+)', response.text)\n for path in disallowed_paths:\n if url.startswith(urljoin(self.base_url, path.strip())):\n return False\n except requests.RequestException:\n pass\n return True\n\n def fetch(self, url):\n \"\"\"Fetch the content of a URL.\"\"\"\n try:\n response = self.session.get(url, timeout=10)\n response.raise_for_status()\n return response.text\n except requests.RequestException as e:\n print(f\"Error fetching {url}: {e}\")\n return None\n\n def parse(self, html_content, url):\n \"\"\"Parse the HTML content and extract headings, paragraphs, and links.\"\"\"\n soup = BeautifulSoup(html_content, 'html.parser')\n page_data = {\n 'url': url,\n 'headings': [heading.get_text(strip=True) for heading in soup.find_all(re.compile('^h[1-6]$'))],\n 'paragraphs': [p.get_text(strip=True) for p in soup.find_all('p')],\n }\n self.data.append(page_data)\n return [urljoin(url, a['href']) for a in soup.find_all('a', href=True)]\n\n def crawl(self, url, depth):\n \"\"\"Crawl a given URL up to a certain depth.\"\"\"\n if depth > self.max_depth or url in self.visited or not self.can_crawl(url):\n return\n\n print(f\"Crawling: {url} at depth {depth}\")\n self.base_url = url\n self.visited.add(url)\n html_content = self.fetch(url)\n if html_content:\n links = self.parse(html_content, url)\n for link in links:\n if link.startswith(self.base_url): # Stay within the same domain\n time.sleep(self.delay) # Respectful crawling\n self.crawl(link, depth + 1)\n return self.get_data()\n\n def get_data(self):\n \"\"\"Return the crawled data.\"\"\"\n return self.data\n\nif __name__ == \"__main__\":\n if len(sys.argv) != 2:\n print(\"Usage: python web_crawler.py <URL>\")\n sys.exit(1)\n\n base_url = sys.argv[1]\n crawler = WebCrawler(max_depth=2)\n data = crawler.crawl(base_url, 0)\n print(json.dumps(data, indent=4))\n", | |
| ".gitignore": "__pycache__\n", | |
| "chatbot_ui.py": "import gradio as gr\nfrom web_crawler import WebCrawler\nfrom rag_system import RAGSystem\nfrom chat_demo import ChatDemo\nimport validators\n\n# Instantiate your existing classes\nrag_system = RAGSystem()\nchat_demo = ChatDemo(rag_system)\n\n# Function to handle the initial URL submission\ndef process_url(url):\n if not validators.url(url):\n return \"Invalid URL. Please enter a valid URL.\", None\n \n try:\n # Display loading message\n status_message = \"Crawling website and processing data...\"\n \n # Instantiate WebCrawler with the provided URL\n web_crawler = WebCrawler()\n \n # Crawl the website\n extracted_content = web_crawler.crawl(url, 0)\n \n # Process the data with the RAG system\n rag_system.process_content(extracted_content)\n\n # Display the website content in an iframe\n iframe_html = f'<iframe src=\"{url}\" width=\"100%\" height=\"500px\"></iframe>'\n \n return \"Website content successfully crawled and processed!\", [], iframe_html\n except Exception as e:\n return f\"Error: {str(e)}\", []\n\n# Function to handle chatbot interactions\ndef chatbot_response(user_input, chat_history):\n try:\n # Use the ChatDemo class to generate a response\n response = chat_demo.chatbot(user_input)\n chat_history.append([\"User\", user_input])\n chat_history.append([\"Chatbot\", response])\n \n # Update the chat history and return\n return chat_history, chat_history\n except Exception as e:\n return [[\"Error\", str(e)]], chat_history\n\n# Function to reset the application\ndef reset_app():\n return \"\", [], \"\", \"\"\n\n# Build the Gradio interface\nwith gr.Blocks() as demo:\n url_input = gr.Textbox(placeholder=\"Enter a website URL\", label=\"Website URL\", interactive=True)\n submit_button = gr.Button(\"Submit URL\")\n status_message = gr.Textbox(label=\"Status\", interactive=False)\n chat_history = gr.State(value=[])\n chatbot_output = gr.Chatbot(label=\"Chat History\")\n user_input = gr.Textbox(placeholder=\"Ask the chatbot...\", label=\"User Input\", interactive=True)\n embedded_view = gr.HTML(label=\"Website View\")\n reset_button = gr.Button(\"Change Website\")\n\n # Initial URL submission\n submit_button.click(process_url, inputs=url_input, outputs=[status_message, chat_history, embedded_view], queue=True)\n\n # Handle chatbot responses\n user_input.submit(chatbot_response, inputs=[user_input, chat_history], outputs=[chatbot_output, chat_history])\n\n # Handle reset button click\n reset_button.click(reset_app, outputs=[url_input, chat_history, status_message, embedded_view])\n\n# Launch the app\ndemo.launch()", | |
| ".flaskenv": "FLASK_APP=proxy_server.py\nFLASK_ENV=development\nFLASK_DEBUG=1\n", | |
| "proxy_thread.py": "import threading\nimport requests\nimport logging\nimport time\nfrom proxy_server import ProxyServer\n\nlogging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')\nlogger = logging.getLogger(__name__)\n\nclass ProxyThread(threading.Thread):\n def __init__(self, host='localhost', port=5000):\n threading.Thread.__init__(self)\n secret_key='your_secret_key_here'\n self.proxy_server = ProxyServer(secret_key, host, port)\n\n def run(self):\n \"\"\"Starts the proxy server in a thread\"\"\"\n logger.info(\"ProxyThread starting\")\n self.proxy_server.run()\n\n def stop(self):\n \"\"\"Stops the proxy server by sending a shutdown request\"\"\"\n logger.info(\"Stopping ProxyThread\")\n try:\n response = requests.post(f'http://{self.proxy_server.host}:{self.proxy_server.port}/shutdown', timeout=5)\n logger.info(f\"Shutdown request sent. Response: {response.text}\")\n except requests.RequestException as e:\n logger.error(f\"Error sending shutdown request: {e}\")\n\n # Wait for the server to shut down\n timeout = 10\n start_time = time.time()\n while self.proxy_server.is_running and time.time() - start_time < timeout:\n time.sleep(0.1)\n\n if self.proxy_server.is_running:\n logger.warning(\"Server did not shut down in time\")\n else:\n logger.info(\"Server has been shut down\")\n\n self.join(timeout=5)\n logger.info(\"ProxyThread stopped\")\n\n" | |