Upload folder using huggingface_hub
Files changed (a sketch of how they fit together follows the list):
- .env +1 -0
- .flaskenv +3 -0
- .gitignore +1 -0
- README.md +2 -8
- chat_demo.py +56 -0
- chatbot_ui.py +184 -0
- project_snapshot_no_images.json +10 -0
- proxy_server.py +146 -0
- proxy_thread.py +43 -0
- rag_system.py +72 -0
- requirements.txt +11 -0
- web_crawler.py +92 -0
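
Taken together, these files add a small RAG-backed "website concierge": web_crawler.py scrapes a site, rag_system.py chunks and embeds the text into a FAISS index, chat_demo.py answers questions over the retrieved chunks, and proxy_server.py / proxy_thread.py serve the target site through a local Flask proxy that injects a Socket.IO highlight script so chatbot_ui.py can embed the site and highlight the passages an answer was based on. A minimal end-to-end sketch of that pipeline, assuming the packages in requirements.txt are installed and OPENAI_API_KEY is set (the URL and depth below are placeholders, not part of the commit):

    # Hypothetical wiring of the committed modules; not part of the commit itself.
    from web_crawler import WebCrawler
    from rag_system import RAGSystem
    from chat_demo import ChatDemo

    pages = WebCrawler(max_depth=1).crawl("https://example.com", 0)  # [{'url', 'headings', 'paragraphs'}, ...]
    rag = RAGSystem()
    rag.process_content(pages)        # chunk, embed, and index the crawled text
    bot = ChatDemo(rag)
    print(bot.chatbot("What is this site about?"))
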
.env
ADDED
@@ -0,0 +1 @@
OPENAI_API_KEY=sk-proj-8BdwEtUg42Q651Jy0GPFT3BlbkFJLmIfOiZ3cXLaD5NyHqAF
.flaskenv
ADDED
@@ -0,0 +1,3 @@
FLASK_APP=proxy_server.py
FLASK_ENV=development
FLASK_DEBUG=1
.gitignore
ADDED
@@ -0,0 +1 @@
__pycache__
README.md
CHANGED
@@ -1,12 +1,6 @@
 ---
-title:
-
-colorFrom: gray
-colorTo: pink
+title: lazzloe.com
+app_file: chatbot_ui.py
 sdk: gradio
 sdk_version: 4.44.1
-app_file: app.py
-pinned: false
 ---
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
chat_demo.py
ADDED
@@ -0,0 +1,56 @@
import openai
import logging

# Configure logging
logger = logging.getLogger(__name__)

class ChatDemo:
    def __init__(self, rag_system):
        self.client = openai.OpenAI()
        self.rag_system = rag_system
        self.last_context_strings = []  # Attribute to store content strings
        logger.info("ChatDemo initialized with RAG system")

    def chatbot(self, user_input):
        logger.info(f"Received user input: {user_input}")
        try:
            similar_chunks = self.rag_system.process_user_query(user_input)
            logger.debug(f"Retrieved {len(similar_chunks)} similar chunks from RAG system")

            # Extract and store content strings
            self.last_context_strings = [chunk['content'] for chunk in similar_chunks]

            # Log each content string
            logger.debug(f"Logging {len(self.last_context_strings)} context strings:")
            for i, content in enumerate(self.last_context_strings):
                logger.debug(f"Context string {i + 1}: {content[:100]}...")  # Log first 100 chars to avoid extremely long log lines

            context = "\n".join(self.last_context_strings)

            messages = [
                {"role": "system", "content": "You are a helpful assistant that answers questions based on the provided context."},
                {"role": "user", "content": f"Context: {context}\n\nQuestion: {user_input}"}
            ]

            logger.debug("Sending request to OpenAI API")
            response = self.client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=messages
            )

            answer = response.choices[0].message.content.strip()
            logger.info("Generated response from OpenAI API")
            return answer
        except Exception as e:
            logger.error(f"Error in chatbot method: {str(e)}", exc_info=True)
            return f"An error occurred: {str(e)}"

    def get_last_context_strings(self):
        """
        Retrieve the list of content strings from the last chatbot query.

        Returns:
            list: A list of strings, each representing the content of a chunk used in the last context.
        """
        logger.info(f"Retrieving last context strings (count: {len(self.last_context_strings)})")
        return self.last_context_strings
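
ChatDemo can also be exercised outside the Gradio UI; a sketch of a simple console loop, assuming `rag` is a RAGSystem already populated via process_content:

    # Hypothetical console loop around ChatDemo (mirrors the CLI loop in the project snapshot).
    demo = ChatDemo(rag)
    while True:
        question = input("You: ")
        if question.lower() in ("exit", "quit"):
            break
        print("Assistant:", demo.chatbot(question))
        print("Context chunks used:", len(demo.get_last_context_strings()))
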
chatbot_ui.py
ADDED
@@ -0,0 +1,184 @@
import gradio as gr
from web_crawler import WebCrawler
from rag_system import RAGSystem
from chat_demo import ChatDemo
import validators
import logging
from proxy_thread import ProxyThread
import requests

# Configure logging
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Instantiate your existing classes
rag_system = RAGSystem()
chat_demo = ChatDemo(rag_system)

# Global variables to track the URL and server state
url = ""
proxy_thread = None

def start_proxy_server():
    global proxy_thread
    # If the proxy server is already running, stop it first
    if proxy_thread and proxy_thread.is_alive():
        logger.info("Stopping the existing ProxyThread...")
        proxy_thread.stop()

    # Start a new proxy server
    proxy_thread = ProxyThread(host='localhost', port=5000)
    proxy_thread.start()
    logger.info("New ProxyThread started.")

def load_website(input_url):
    global url
    url = input_url  # Update the global url variable
    logger.info(f"Loading website: {url}")

    # Restart the ProxyServer with the new URL
    start_proxy_server()

    # Proxy server expects a specific URL format with target_url
    proxied_url = f"http://127.0.0.1:5000/?target_url={url}"

    iframe_html = f"""
    <iframe src="{proxied_url}" width="100%" height="600px"></iframe>
    """
    return iframe_html

def highlight_text(text):
    if not url:
        return "No website loaded."

    logger.info(f"Highlighting text: {text}")
    try:
        response = requests.post(f'http://127.0.0.1:5000/set_highlight', json={"highlight": text})
        if response.status_code == 200:
            return "Highlight applied."
        else:
            return "Failed to apply highlight."
    except Exception as e:
        logger.error(f"Error highlighting text: {e}")
        return f"Error: {e}"

def clear_highlights():
    if not url:
        return "No website loaded."

    logger.info("Clearing highlights and reloading the website.")
    start_proxy_server()  # Restart the proxy server to clear highlights
    return load_website(url)  # Reload the current website without highlights

# Function to handle the initial URL submission
def process_url(input_url):
    logger.info(f"Processing URL: {input_url}")
    if not validators.url(input_url):
        logger.error(f"Invalid URL submitted: {input_url}")
        return "Invalid URL. Please enter a valid URL.", None

    try:
        # Display loading message
        status_message = "Crawling website and processing data..."
        logger.info(status_message)

        # Instantiate WebCrawler with the provided URL
        web_crawler = WebCrawler()

        # Crawl the website
        logger.info(f"Starting web crawl for {input_url}")
        extracted_content = web_crawler.crawl(input_url, 0)
        logger.info(f"Web crawl completed for {input_url}")

        # Process the data with the RAG system
        logger.info("Processing extracted content with RAG system")
        rag_system.process_content(extracted_content)

        # Load the website through the proxy
        iframe_html = load_website(input_url)

        logger.info("URL processing completed successfully")
        return "Website content successfully crawled and processed!", [], iframe_html
    except Exception as e:
        logger.error(f"Error processing URL {input_url}: {str(e)}", exc_info=True)
        return f"Error: {str(e)}", []

# Function to handle chatbot interactions
def chatbot_response(user_input, chat_history):
    logger.info(f"Received user input: {user_input}")
    try:
        # Use the ChatDemo class to generate a response
        logger.info("Generating chatbot response")
        response = chat_demo.chatbot(user_input)
        chat_history.append(["User", user_input])
        chat_history.append(["Chatbot", response])

        logger.info("Chatbot response generated successfully")

        # Get the context strings used for the response
        context_strings = chat_demo.get_last_context_strings()
        logger.info(f"Retrieved {len(context_strings)} context strings")

        # Highlight each context string individually
        for i, context in enumerate(context_strings, 1):
            highlight_result = highlight_text(context)
            logger.info(f"Highlight result for context {i}: {highlight_result}")

        # Prepare status message
        highlight_status = f"Highlighted {len(context_strings)} context passages"
        logger.info(highlight_status)

        # Update the chat history and return
        return chat_history, chat_history, highlight_status
    except Exception as e:
        logger.error(f"Error in chatbot_response: {str(e)}", exc_info=True)
        return [[f"Error: {str(e)}"], chat_history], chat_history, f"Error: {str(e)}"

# Function to reset the application
def reset_app():
    global url
    url = ""  # Clear the global URL
    logger.info("Resetting application and proxy server")
    start_proxy_server()  # Restart the proxy server for a fresh session
    return "", [], "", ""

# Build the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Website Concierge")

    with gr.Row():
        with gr.Column(scale=1):
            url_input = gr.Textbox(placeholder="Enter a website URL", label="Website URL", interactive=True)
            submit_button = gr.Button("Submit URL")
            status_message = gr.Textbox(label="Status", interactive=False)

            chat_history = gr.State(value=[])
            chatbot_output = gr.Chatbot(label="Chat History")
            user_input = gr.Textbox(placeholder="Ask the chatbot...", label="User Input", interactive=True)

            highlight_status = gr.Textbox(label="Highlight Status", interactive=False)

            clear_button = gr.Button("Clear Highlights")
            reset_button = gr.Button("Change Website")

        with gr.Column(scale=1):
            proxied_view = gr.HTML(label="Website View")

    # Initial URL submission
    submit_button.click(process_url, inputs=url_input, outputs=[status_message, chat_history, proxied_view], queue=True)

    # Handle chatbot responses
    user_input.submit(chatbot_response, inputs=[user_input, chat_history], outputs=[chatbot_output, chat_history, highlight_status])

    # Handle clearing highlights
    clear_button.click(clear_highlights, outputs=[proxied_view])

    # Handle reset button click
    reset_button.click(reset_app, outputs=[url_input, chat_history, status_message, proxied_view])

# Launch the app
if __name__ == "__main__":
    logger.info("Starting Gradio application")
    start_proxy_server()  # Start with an initial ProxyServer
    demo.launch()
    logger.info("Gradio application stopped")
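
chatbot_ui.py starts the proxy thread on port 5000 and then calls demo.launch() with Gradio defaults. If the default Gradio port clashes with something else on the machine, launch() accepts explicit server settings; a hedged alternative (the port number is only illustrative):

    # Hypothetical alternative to the plain demo.launch() call above.
    demo.launch(server_name="127.0.0.1", server_port=7861, share=False)
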
project_snapshot_no_images.json
ADDED
@@ -0,0 +1,10 @@
{
".env": "OPENAI_API_KEY=sk-proj-8BdwEtUg42Q651Jy0GPFT3BlbkFJLmIfOiZ3cXLaD5NyHqAF\n",
"proxy_server.py": "import time\nfrom flask import Flask, request, Response, session, jsonify\nfrom flask_socketio import SocketIO, emit\nfrom werkzeug.serving import make_server\nimport requests\nimport re\nimport logging\nimport threading\n\nclass ProxyServer:\n def __init__(self, secret_key, host='localhost', port=5000):\n self.host = host\n self.port = port\n self.app = Flask(__name__)\n self.app.secret_key = secret_key\n self.server = None\n self.is_running = False\n self.socketio = SocketIO(self.app) # Initialize SocketIO with the Flask app\n # self.server_thread = None # Thread for running the server\n # self.server_running = False # Flag to track server state\n self.setup_routes()\n self.highlight_word = None # Initialize the highlight word\n \n @self.app.route('/shutdown', methods=['POST'])\n def shutdown():\n self.logger.info(\"Shutdown request received\")\n self.shutdown_server()\n return 'Server shutting down...'\n \n # Setup logging\n self.logger = logging.getLogger('ProxyServer')\n self.logger.setLevel(logging.DEBUG) # Set the logging level (DEBUG, INFO, WARNING, ERROR)\n handler = logging.StreamHandler() # Log to standard output (can be customized to log to a file)\n handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))\n self.logger.addHandler(handler)\n\n self.logger.info(\"Proxy server initialized\")\n\n # Inject JavaScript into HTML content to highlight words and listen for WebSocket updates\n def inject_script(self, content):\n # Inject the WebSocket listening script\n script = f\"\"\"\n <script src=\"https://cdnjs.cloudflare.com/ajax/libs/socket.io/4.0.1/socket.io.js\"></script>\n <script>\n let currentHighlight = \"{self.highlight_word}\";\n function highlightWord(word) {{\n if (word) {{\n document.body.innerHTML = document.body.innerHTML.replace(\n new RegExp(word, 'g'),\n '<span style=\"background-color: yellow;\">' + word + '</span>'\n );\n }}\n }}\n highlightWord(currentHighlight);\n\n // Connect to WebSocket\n const socket = io();\n socket.on('new_highlight', function(data) {{\n currentHighlight = data.highlight;\n highlightWord(currentHighlight);\n }});\n </script>\n \"\"\"\n return re.sub(r'</body>', script + '</body>', content)\n\n # Ensure the target_url and path are handled correctly\n def build_full_url(self, target_url, path):\n if not target_url.endswith('/') and not path.startswith('/'):\n return f\"{target_url}/{path}\"\n return f\"{target_url}{path}\"\n\n # Route handler for proxying requests\n def proxy(self, path=''):\n target_url = request.args.get('target_url')\n if not target_url and 'target_url' in session:\n target_url = session['target_url']\n elif target_url:\n session['target_url'] = target_url\n\n if not target_url:\n self.logger.error(\"No target_url provided\")\n return \"Error: target_url query parameter is required\", 400\n\n full_target_url = self.build_full_url(target_url, path)\n self.logger.info(f\"Proxying request to {full_target_url}\")\n\n headers = {key: value for key, value in request.headers if key != 'Host'}\n\n # Handle POST or GET requests\n if request.method == 'POST':\n response = requests.post(full_target_url, headers=headers, data=request.get_data(), stream=True)\n else:\n response = requests.get(full_target_url, headers=headers, stream=True)\n\n # If it's HTML content, inject the script\n if 'text/html' in response.headers.get('Content-Type', ''):\n def generate():\n for chunk in response.iter_content(chunk_size=1024):\n if chunk:\n rewritten_chunk = 
self.inject_script(chunk.decode('utf-8'))\n yield rewritten_chunk.encode('utf-8')\n self.logger.info(f\"Injecting script into HTML response from {full_target_url}\")\n return Response(generate(), content_type=response.headers['Content-Type'])\n\n # Stream non-HTML content (images, scripts, etc.)\n else:\n def generate():\n for chunk in response.iter_content(chunk_size=1024):\n if chunk:\n yield chunk\n return Response(generate(), content_type=response.headers['Content-Type'])\n\n # API endpoint to set a new highlight word\n def set_highlight(self):\n new_highlight = request.json.get('highlight')\n if new_highlight:\n self.highlight_word = new_highlight\n # Emit the new highlight word to all connected clients\n self.socketio.emit('new_highlight', {'highlight': new_highlight})\n self.logger.info(f\"Highlight word updated to '{new_highlight}' and broadcasted to clients\")\n return jsonify({\"message\": \"Highlight word updated\", \"highlight\": new_highlight}), 200\n self.logger.error(\"No highlight word provided\")\n return jsonify({\"error\": \"No highlight word provided\"}), 400\n\n # Setup routes to proxy all requests and WebSocket events\n def setup_routes(self):\n self.app.add_url_rule('/', defaults={'path': ''}, view_func=self.proxy, methods=['GET', 'POST', 'PUT', 'DELETE', 'OPTIONS'])\n self.app.add_url_rule('/<path:path>', view_func=self.proxy, methods=['GET', 'POST', 'PUT', 'DELETE', 'OPTIONS'])\n self.app.add_url_rule('/set_highlight', view_func=self.set_highlight, methods=['POST'])\n\n def run(self):\n \"\"\"Runs the Werkzeug server\"\"\"\n logging.info(f\"Starting server on {self.host}:{self.port}\")\n self.server = make_server(self.host, self.port, self.app, threaded=True)\n self.is_running = True\n self.server.serve_forever()\n\n def shutdown_server(self):\n \"\"\"Shuts down the Werkzeug server\"\"\"\n if self.server:\n self.logger.info(\"Shutting down server...\")\n self.is_running = False\n self.server.shutdown()\n self.logger.info(\"Server shut down complete\")\n\n# Create an instance of ProxyServer and expose the app\n# proxy_server = ProxyServer(secret_key='your_secret_key_here')\n# app = proxy_server.app # Expose the Flask app to the top-level scope for Flask CLI\n\n# if __name__ == '__main__':\n# proxy_server.run(port=5000)\n",
"rag_system.py": "import openai\nimport faiss\nimport numpy as np\nimport sys\nfrom sklearn.metrics.pairwise import cosine_similarity\nfrom chat_demo import ChatDemo\nfrom web_crawler import WebCrawler \n\nclass RAGSystem:\n def __init__(self, model_name=\"text-embedding-ada-002\"):\n self.client = openai.OpenAI()\n self.model_name = model_name\n self.index = None\n self.faiss_data = []\n\n def split_into_chunks(self, page_data, max_chunk_size=500):\n chunks = []\n for page in page_data:\n url = page['url']\n for paragraph in page['paragraphs']:\n if len(paragraph) <= max_chunk_size:\n chunks.append({'content': paragraph, 'url': url})\n else:\n # Break long paragraphs into smaller chunks\n for i in range(0, len(paragraph), max_chunk_size):\n chunks.append({'content': paragraph[i:i+max_chunk_size], 'url': url})\n return chunks\n\n def compute_embeddings(self, text_chunks):\n texts = [chunk['content'] for chunk in text_chunks]\n response = self.client.embeddings.create(model=self.model_name, input=texts)\n embeddings = [result.embedding for result in response.data]\n return embeddings\n\n def store_embeddings_in_faiss(self, embeddings, text_chunks):\n # Initialize a FAISS index\n dimension = len(embeddings[0])\n self.index = faiss.IndexFlatL2(dimension)\n \n for idx, embedding in enumerate(embeddings):\n np_embedding = np.array(embedding, dtype='float32')\n self.index.add(np_embedding.reshape(1, -1))\n self.faiss_data.append({\n 'embedding': np_embedding,\n 'content': text_chunks[idx]['content'],\n 'url': text_chunks[idx]['url']\n })\n\n def process_content(self, website_data):\n # Split data into chunks\n text_chunks = self.split_into_chunks(website_data)\n \n # Compute embeddings and create vector database\n embeddings = self.compute_embeddings(text_chunks)\n self.store_embeddings_in_faiss(embeddings, text_chunks)\n\n def process_user_query(self, query):\n # Compute the embedding of the query\n response = self.client.embeddings.create(model=self.model_name, input=[query])\n query_embedding = response.data[0].embedding\n np_query_embedding = np.array(query_embedding, dtype='float32').reshape(1, -1)\n\n # Find the most similar embeddings\n distances, indices = self.index.search(np_query_embedding, 5) # Retrieve top 5 similar results\n similar_chunks = [self.faiss_data[i] for i in indices[0]]\n\n return similar_chunks\n\n# Main function\nif __name__ == \"__main__\":\n if len(sys.argv) != 2:\n print(\"Usage: python rag_manager.py <url>\")\n sys.exit(1)\n\n url = sys.argv[1]\n # Crawl the website\n crawler = WebCrawler()\n website_data = crawler.crawl(url, 2)\n print(\"website data\", website_data)\n\n # Initialize RAGManager\n rag_system = RAGSystem()\n\n # Process content from page data\n rag_system.process_content(website_data)\n\n # Initialize ChatDemo with RAGManager\n chat_demo = ChatDemo(rag_system)\n\n # Command-line interaction loop for user queries\n while True:\n user_query = input(\"You: \")\n if user_query.lower() in [\"exit\", \"quit\"]:\n print(\"Goodbye!\")\n break\n response = chat_demo.chatbot(user_query)\n print(f\"Assistant: {response}\")",
"web_crawler.py": "import requests\nfrom bs4 import BeautifulSoup\nfrom urllib.parse import urljoin, urlparse\nimport json\nimport re\nimport time\nimport sys\n\nclass WebCrawler:\n def __init__(self, max_depth=2):\n self.base_url = None\n self.visited = set()\n self.max_depth = max_depth\n self.data = []\n self.session = requests.Session()\n self.delay = 0.1 # Delay between requests to prevent overwhelming the server\n\n def can_crawl(self, url):\n \"\"\"Check robots.txt to see if crawling is allowed.\"\"\"\n parsed_url = urlparse(url)\n robots_url = urljoin(f\"{parsed_url.scheme}://{parsed_url.netloc}\", '/robots.txt')\n try:\n response = self.session.get(robots_url, timeout=10)\n if response.status_code == 200:\n disallowed_paths = re.findall(r'Disallow: (.+)', response.text)\n for path in disallowed_paths:\n if url.startswith(urljoin(self.base_url, path.strip())):\n return False\n except requests.RequestException:\n pass\n return True\n\n def fetch(self, url):\n \"\"\"Fetch the content of a URL.\"\"\"\n try:\n response = self.session.get(url, timeout=10)\n response.raise_for_status()\n return response.text\n except requests.RequestException as e:\n print(f\"Error fetching {url}: {e}\")\n return None\n\n def parse(self, html_content, url):\n \"\"\"Parse the HTML content and extract headings, paragraphs, and links.\"\"\"\n soup = BeautifulSoup(html_content, 'html.parser')\n page_data = {\n 'url': url,\n 'headings': [heading.get_text(strip=True) for heading in soup.find_all(re.compile('^h[1-6]$'))],\n 'paragraphs': [p.get_text(strip=True) for p in soup.find_all('p')],\n }\n self.data.append(page_data)\n return [urljoin(url, a['href']) for a in soup.find_all('a', href=True)]\n\n def crawl(self, url, depth):\n \"\"\"Crawl a given URL up to a certain depth.\"\"\"\n if depth > self.max_depth or url in self.visited or not self.can_crawl(url):\n return\n\n print(f\"Crawling: {url} at depth {depth}\")\n self.base_url = url\n self.visited.add(url)\n html_content = self.fetch(url)\n if html_content:\n links = self.parse(html_content, url)\n for link in links:\n if link.startswith(self.base_url): # Stay within the same domain\n time.sleep(self.delay) # Respectful crawling\n self.crawl(link, depth + 1)\n return self.get_data()\n\n def get_data(self):\n \"\"\"Return the crawled data.\"\"\"\n return self.data\n\nif __name__ == \"__main__\":\n if len(sys.argv) != 2:\n print(\"Usage: python web_crawler.py <URL>\")\n sys.exit(1)\n\n base_url = sys.argv[1]\n crawler = WebCrawler(max_depth=2)\n data = crawler.crawl(base_url, 0)\n print(json.dumps(data, indent=4))\n",
".gitignore": "__pycache__\n",
"chatbot_ui.py": "import gradio as gr\nfrom web_crawler import WebCrawler\nfrom rag_system import RAGSystem\nfrom chat_demo import ChatDemo\nimport validators\n\n# Instantiate your existing classes\nrag_system = RAGSystem()\nchat_demo = ChatDemo(rag_system)\n\n# Function to handle the initial URL submission\ndef process_url(url):\n if not validators.url(url):\n return \"Invalid URL. Please enter a valid URL.\", None\n \n try:\n # Display loading message\n status_message = \"Crawling website and processing data...\"\n \n # Instantiate WebCrawler with the provided URL\n web_crawler = WebCrawler()\n \n # Crawl the website\n extracted_content = web_crawler.crawl(url, 0)\n \n # Process the data with the RAG system\n rag_system.process_content(extracted_content)\n\n # Display the website content in an iframe\n iframe_html = f'<iframe src=\"{url}\" width=\"100%\" height=\"500px\"></iframe>'\n \n return \"Website content successfully crawled and processed!\", [], iframe_html\n except Exception as e:\n return f\"Error: {str(e)}\", []\n\n# Function to handle chatbot interactions\ndef chatbot_response(user_input, chat_history):\n try:\n # Use the ChatDemo class to generate a response\n response = chat_demo.chatbot(user_input)\n chat_history.append([\"User\", user_input])\n chat_history.append([\"Chatbot\", response])\n \n # Update the chat history and return\n return chat_history, chat_history\n except Exception as e:\n return [[\"Error\", str(e)]], chat_history\n\n# Function to reset the application\ndef reset_app():\n return \"\", [], \"\", \"\"\n\n# Build the Gradio interface\nwith gr.Blocks() as demo:\n url_input = gr.Textbox(placeholder=\"Enter a website URL\", label=\"Website URL\", interactive=True)\n submit_button = gr.Button(\"Submit URL\")\n status_message = gr.Textbox(label=\"Status\", interactive=False)\n chat_history = gr.State(value=[])\n chatbot_output = gr.Chatbot(label=\"Chat History\")\n user_input = gr.Textbox(placeholder=\"Ask the chatbot...\", label=\"User Input\", interactive=True)\n embedded_view = gr.HTML(label=\"Website View\")\n reset_button = gr.Button(\"Change Website\")\n\n # Initial URL submission\n submit_button.click(process_url, inputs=url_input, outputs=[status_message, chat_history, embedded_view], queue=True)\n\n # Handle chatbot responses\n user_input.submit(chatbot_response, inputs=[user_input, chat_history], outputs=[chatbot_output, chat_history])\n\n # Handle reset button click\n reset_button.click(reset_app, outputs=[url_input, chat_history, status_message, embedded_view])\n\n# Launch the app\ndemo.launch()",
".flaskenv": "FLASK_APP=proxy_server.py\nFLASK_ENV=development\nFLASK_DEBUG=1\n",
"proxy_thread.py": "import threading\nimport requests\nimport logging\nimport time\nfrom proxy_server import ProxyServer\n\nlogging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')\nlogger = logging.getLogger(__name__)\n\nclass ProxyThread(threading.Thread):\n def __init__(self, host='localhost', port=5000):\n threading.Thread.__init__(self)\n secret_key='your_secret_key_here'\n self.proxy_server = ProxyServer(secret_key, host, port)\n\n def run(self):\n \"\"\"Starts the proxy server in a thread\"\"\"\n logger.info(\"ProxyThread starting\")\n self.proxy_server.run()\n\n def stop(self):\n \"\"\"Stops the proxy server by sending a shutdown request\"\"\"\n logger.info(\"Stopping ProxyThread\")\n try:\n response = requests.post(f'http://{self.proxy_server.host}:{self.proxy_server.port}/shutdown', timeout=5)\n logger.info(f\"Shutdown request sent. Response: {response.text}\")\n except requests.RequestException as e:\n logger.error(f\"Error sending shutdown request: {e}\")\n\n # Wait for the server to shut down\n timeout = 10\n start_time = time.time()\n while self.proxy_server.is_running and time.time() - start_time < timeout:\n time.sleep(0.1)\n\n if self.proxy_server.is_running:\n logger.warning(\"Server did not shut down in time\")\n else:\n logger.info(\"Server has been shut down\")\n\n self.join(timeout=5)\n logger.info(\"ProxyThread stopped\")\n\n"
}
proxy_server.py
ADDED
@@ -0,0 +1,146 @@
import time
from flask import Flask, request, Response, session, jsonify
from flask_socketio import SocketIO, emit
from werkzeug.serving import make_server
import requests
import re
import logging

# Configure logging
logger = logging.getLogger(__name__)

class ProxyServer:
    def __init__(self, secret_key, host='localhost', port=5000):
        self.host = host
        self.port = port
        self.app = Flask(__name__)
        self.app.secret_key = secret_key
        self.server = None
        self.is_running = False
        self.socketio = SocketIO(self.app)  # Initialize SocketIO with the Flask app
        # self.server_thread = None  # Thread for running the server
        # self.server_running = False  # Flag to track server state
        self.setup_routes()
        self.highlight_word = None  # Initialize the highlight word

        @self.app.route('/shutdown', methods=['POST'])
        def shutdown():
            logger.info("Shutdown request received")
            self.shutdown_server()
            return 'Server shutting down...'

        logger.info("Proxy server initialized")

    # Inject JavaScript into HTML content to highlight words and listen for WebSocket updates
    def inject_script(self, content):
        # Inject the WebSocket listening script
        script = f"""
        <script src="https://cdnjs.cloudflare.com/ajax/libs/socket.io/4.0.1/socket.io.js"></script>
        <script>
            let currentHighlight = "{self.highlight_word}";
            function highlightWord(word) {{
                if (word) {{
                    document.body.innerHTML = document.body.innerHTML.replace(
                        new RegExp(word, 'g'),
                        '<span style="background-color: yellow;">' + word + '</span>'
                    );
                }}
            }}
            highlightWord(currentHighlight);

            // Connect to WebSocket
            const socket = io();
            socket.on('new_highlight', function(data) {{
                currentHighlight = data.highlight;
                highlightWord(currentHighlight);
            }});
        </script>
        """
        return re.sub(r'</body>', script + '</body>', content)

    # Ensure the target_url and path are handled correctly
    def build_full_url(self, target_url, path):
        if not target_url.endswith('/') and not path.startswith('/'):
            return f"{target_url}/{path}"
        return f"{target_url}{path}"

    # Route handler for proxying requests
    def proxy(self, path=''):
        target_url = request.args.get('target_url')
        if not target_url and 'target_url' in session:
            target_url = session['target_url']
        elif target_url:
            session['target_url'] = target_url

        if not target_url:
            logger.error("No target_url provided")
            return "Error: target_url query parameter is required", 400

        full_target_url = self.build_full_url(target_url, path)
        logger.info(f"Proxying request to {full_target_url}")

        headers = {key: value for key, value in request.headers if key != 'Host'}

        # Handle POST or GET requests
        if request.method == 'POST':
            response = requests.post(full_target_url, headers=headers, data=request.get_data(), stream=True)
        else:
            response = requests.get(full_target_url, headers=headers, stream=True)

        # If it's HTML content, inject the script
        if 'text/html' in response.headers.get('Content-Type', ''):
            def generate():
                for chunk in response.iter_content(chunk_size=1024):
                    if chunk:
                        rewritten_chunk = self.inject_script(chunk.decode('utf-8'))
                        yield rewritten_chunk.encode('utf-8')
            logger.info(f"Injecting script into HTML response from {full_target_url}")
            return Response(generate(), content_type=response.headers['Content-Type'])

        # Stream non-HTML content (images, scripts, etc.)
        else:
            def generate():
                for chunk in response.iter_content(chunk_size=1024):
                    if chunk:
                        yield chunk
            return Response(generate(), content_type=response.headers['Content-Type'])

    # API endpoint to set a new highlight word
    def set_highlight(self):
        new_highlight = request.json.get('highlight')
        if new_highlight:
            self.highlight_word = new_highlight
            # Emit the new highlight word to all connected clients
            self.socketio.emit('new_highlight', {'highlight': new_highlight})
            logger.info(f"Highlight word updated to '{new_highlight}' and broadcasted to clients")
            return jsonify({"message": "Highlight word updated", "highlight": new_highlight}), 200
        logger.error("No highlight word provided")
        return jsonify({"error": "No highlight word provided"}), 400

    # Setup routes to proxy all requests and WebSocket events
    def setup_routes(self):
        self.app.add_url_rule('/', defaults={'path': ''}, view_func=self.proxy, methods=['GET', 'POST', 'PUT', 'DELETE', 'OPTIONS'])
        self.app.add_url_rule('/<path:path>', view_func=self.proxy, methods=['GET', 'POST', 'PUT', 'DELETE', 'OPTIONS'])
        self.app.add_url_rule('/set_highlight', view_func=self.set_highlight, methods=['POST'])

    def run(self):
        """Runs the Werkzeug server"""
        logging.info(f"Starting server on {self.host}:{self.port}")
        self.server = make_server(self.host, self.port, self.app, threaded=True)
        self.is_running = True
        self.server.serve_forever()

    def shutdown_server(self):
        """Shuts down the Werkzeug server"""
        if self.server:
            logger.info("Shutting down server...")
            self.is_running = False
            self.server.shutdown()
            logger.info("Server shut down complete")

# Create an instance of ProxyServer and expose the app
# proxy_server = ProxyServer(secret_key='your_secret_key_here')
# app = proxy_server.app  # Expose the Flask app to the top-level scope for Flask CLI

# if __name__ == '__main__':
#     proxy_server.run(port=5000)
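
Once the proxy is running, the highlight flow can be exercised by hand: POST a word to /set_highlight and the server broadcasts it over Socket.IO to every proxied page, which re-runs highlightWord(). A sketch of such a manual test (the word "pricing" is just an example):

    # Hypothetical manual test of the /set_highlight endpoint defined above.
    import requests
    resp = requests.post("http://127.0.0.1:5000/set_highlight", json={"highlight": "pricing"})
    print(resp.status_code, resp.json())  # expects 200 and {"message": "Highlight word updated", ...}
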
proxy_thread.py
ADDED
@@ -0,0 +1,43 @@
import threading
import requests
import logging
import time
from proxy_server import ProxyServer

# Configure logging
logger = logging.getLogger(__name__)

class ProxyThread(threading.Thread):
    def __init__(self, host='localhost', port=5000):
        threading.Thread.__init__(self)
        secret_key='your_secret_key_here'
        self.proxy_server = ProxyServer(secret_key, host, port)

    def run(self):
        """Starts the proxy server in a thread"""
        logger.info("ProxyThread starting")
        self.proxy_server.run()

    def stop(self):
        """Stops the proxy server by sending a shutdown request"""
        logger.info("Stopping ProxyThread")
        try:
            response = requests.post(f'http://{self.proxy_server.host}:{self.proxy_server.port}/shutdown', timeout=5)
            logger.info(f"Shutdown request sent. Response: {response.text}")
        except requests.RequestException as e:
            logger.error(f"Error sending shutdown request: {e}")

        # Wait for the server to shut down
        timeout = 10
        start_time = time.time()
        while self.proxy_server.is_running and time.time() - start_time < timeout:
            time.sleep(0.1)

        if self.proxy_server.is_running:
            logger.warning("Server did not shut down in time")
        else:
            logger.info("Server has been shut down")

        self.join(timeout=5)
        logger.info("ProxyThread stopped")
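
ProxyThread is what chatbot_ui.py uses to run the Flask proxy alongside Gradio; standalone use looks like the sketch below (host and port mirror the defaults):

    # Hypothetical standalone use of ProxyThread (mirrors start_proxy_server() in chatbot_ui.py).
    from proxy_thread import ProxyThread
    thread = ProxyThread(host='localhost', port=5000)
    thread.start()   # serves e.g. http://localhost:5000/?target_url=https://example.com
    # ... use the proxied site ...
    thread.stop()    # POSTs /shutdown, waits for the server to stop, then joins the thread
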
rag_system.py
ADDED
@@ -0,0 +1,72 @@
import openai
import faiss
import numpy as np
import logging
from sklearn.metrics.pairwise import cosine_similarity

# Configure logging
logger = logging.getLogger(__name__)

class RAGSystem:
    def __init__(self, model_name="text-embedding-ada-002"):
        self.client = openai.OpenAI()
        self.model_name = model_name
        self.index = None
        self.faiss_data = []
        logger.info(f"RAGSystem initialized with model: {model_name}")

    def split_into_chunks(self, page_data, max_chunk_size=500):
        logger.info(f"Splitting data into chunks with max size: {max_chunk_size}")
        chunks = []
        for page in page_data:
            url = page['url']
            for paragraph in page['paragraphs']:
                if len(paragraph) <= max_chunk_size:
                    chunks.append({'content': paragraph, 'url': url})
                else:
                    # Break long paragraphs into smaller chunks
                    for i in range(0, len(paragraph), max_chunk_size):
                        chunks.append({'content': paragraph[i:i+max_chunk_size], 'url': url})
        logger.debug(f"Created {len(chunks)} chunks")
        return chunks

    def compute_embeddings(self, text_chunks):
        logger.info(f"Computing embeddings for {len(text_chunks)} chunks")
        texts = [chunk['content'] for chunk in text_chunks]
        response = self.client.embeddings.create(model=self.model_name, input=texts)
        embeddings = [result.embedding for result in response.data]
        logger.debug(f"Computed {len(embeddings)} embeddings")
        return embeddings

    def store_embeddings_in_faiss(self, embeddings, text_chunks):
        logger.info("Storing embeddings in FAISS index")
        dimension = len(embeddings[0])
        self.index = faiss.IndexFlatL2(dimension)

        for idx, embedding in enumerate(embeddings):
            np_embedding = np.array(embedding, dtype='float32')
            self.index.add(np_embedding.reshape(1, -1))
            self.faiss_data.append({
                'embedding': np_embedding,
                'content': text_chunks[idx]['content'],
                'url': text_chunks[idx]['url']
            })
        logger.debug(f"Stored {len(embeddings)} embeddings in FAISS index")

    def process_content(self, website_data):
        logger.info("Processing website content")
        text_chunks = self.split_into_chunks(website_data)
        embeddings = self.compute_embeddings(text_chunks)
        self.store_embeddings_in_faiss(embeddings, text_chunks)
        logger.info("Content processing completed")

    def process_user_query(self, query):
        logger.info(f"Processing user query: {query}")
        response = self.client.embeddings.create(model=self.model_name, input=[query])
        query_embedding = response.data[0].embedding
        np_query_embedding = np.array(query_embedding, dtype='float32').reshape(1, -1)

        distances, indices = self.index.search(np_query_embedding, 5)  # Retrieve top 5 similar results
        similar_chunks = [self.faiss_data[i] for i in indices[0]]
        logger.debug(f"Retrieved {len(similar_chunks)} similar chunks for the query")
        return similar_chunks
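
After process_content() has built the FAISS index, retrieval can be inspected on its own; a sketch, assuming `rag` holds a populated RAGSystem and the query string is arbitrary:

    # Hypothetical retrieval check against the FAISS index built above.
    chunks = rag.process_user_query("What services are offered?")
    for chunk in chunks:                       # top-5 nearest chunks by L2 distance
        print(chunk['url'], chunk['content'][:80])
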
requirements.txt
ADDED
@@ -0,0 +1,11 @@
beautifulsoup4==4.12.3
faiss_cpu==1.8.0.post1
Flask==3.0.3
Flask_SocketIO==5.4.1
gradio==4.44.1
numpy==2.1.2
openai==1.51.1
Requests==2.32.3
scikit_learn==1.5.2
validators==0.28.1
Werkzeug==3.0.4
web_crawler.py
ADDED
@@ -0,0 +1,92 @@
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import json
import re
import time
import sys
import logging

# Configure logging
logger = logging.getLogger(__name__)

class WebCrawler:
    def __init__(self, max_depth=2):
        self.base_url = None
        self.visited = set()
        self.max_depth = max_depth
        self.data = []
        self.session = requests.Session()
        self.delay = 0.1  # Delay between requests to prevent overwhelming the server
        logger.info(f"WebCrawler initialized with max_depth: {max_depth}")

    def can_crawl(self, url):
        logger.debug(f"Checking if can crawl: {url}")
        parsed_url = urlparse(url)
        robots_url = urljoin(f"{parsed_url.scheme}://{parsed_url.netloc}", '/robots.txt')
        try:
            response = self.session.get(robots_url, timeout=10)
            if response.status_code == 200:
                disallowed_paths = re.findall(r'Disallow: (.+)', response.text)
                for path in disallowed_paths:
                    if url.startswith(urljoin(self.base_url, path.strip())):
                        logger.info(f"Crawling not allowed for: {url}")
                        return False
        except requests.RequestException:
            logger.warning(f"Error fetching robots.txt for {url}", exc_info=True)
        logger.debug(f"Crawling allowed for: {url}")
        return True

    def fetch(self, url):
        logger.info(f"Fetching content from: {url}")
        try:
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            logger.debug(f"Successfully fetched content from: {url}")
            return response.text
        except requests.RequestException as e:
            logger.error(f"Error fetching {url}: {e}", exc_info=True)
            return None

    def parse(self, html_content, url):
        logger.info(f"Parsing HTML content from: {url}")
        soup = BeautifulSoup(html_content, 'html.parser')
        page_data = {
            'url': url,
            'headings': [heading.get_text(strip=True) for heading in soup.find_all(re.compile('^h[1-6]$'))],
            'paragraphs': [p.get_text(strip=True) for p in soup.find_all('p')],
        }
        self.data.append(page_data)
        links = [urljoin(url, a['href']) for a in soup.find_all('a', href=True)]
        logger.debug(f"Parsed {len(links)} links from {url}")
        return links

    def crawl(self, url, depth):
        if depth > self.max_depth or url in self.visited or not self.can_crawl(url):
            return

        logger.info(f"Crawling: {url} at depth {depth}")
        self.base_url = url
        self.visited.add(url)
        html_content = self.fetch(url)
        if html_content:
            links = self.parse(html_content, url)
            for link in links:
                if link.startswith(self.base_url):  # Stay within the same domain
                    time.sleep(self.delay)  # Respectful crawling
                    self.crawl(link, depth + 1)
        return self.get_data()

    def get_data(self):
        logger.info(f"Returning crawled data: {len(self.data)} pages")
        return self.data

    if __name__ == "__main__":
        pass

if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: python web_crawler.py <URL>")
        sys.exit(1)

    base_url = sys.argv[1]
    crawler = WebCrawler(max_depth=2)
    data = crawler.crawl(base_url, 0)
    print(json.dumps(data, indent=4))
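
For a standalone crawl (the same path the module's __main__ block takes), something like the sketch below works; the URL is a placeholder:

    # Hypothetical standalone crawl mirroring the __main__ block above.
    crawler = WebCrawler(max_depth=1)
    pages = crawler.crawl("https://example.com", 0) or []
    print(len(pages), "pages crawled;",
          sum(len(p['paragraphs']) for p in pages), "paragraphs extracted")
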