Upload folder using huggingface_hub
Files changed (a sketch of how they fit together follows the list):
- .env +1 -0
- .flaskenv +3 -0
- .gitignore +1 -0
- README.md +2 -8
- chat_demo.py +56 -0
- chatbot_ui.py +184 -0
- project_snapshot_no_images.json +10 -0
- proxy_server.py +146 -0
- proxy_thread.py +43 -0
- rag_system.py +72 -0
- requirements.txt +11 -0
- web_crawler.py +92 -0
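
Taken together, these files add a small RAG-backed "website concierge": web_crawler.py scrapes a site, rag_system.py chunks and embeds the text into a FAISS index, chat_demo.py answers questions over the retrieved chunks, and proxy_server.py / proxy_thread.py serve the target site through a local Flask proxy that injects a Socket.IO highlight script so chatbot_ui.py can embed the site and highlight the passages an answer was based on. A minimal end-to-end sketch of that pipeline, assuming the packages in requirements.txt are installed and OPENAI_API_KEY is set (the URL and depth below are placeholders, not part of the commit):

    # Hypothetical wiring of the committed modules; not part of the commit itself.
    from web_crawler import WebCrawler
    from rag_system import RAGSystem
    from chat_demo import ChatDemo

    pages = WebCrawler(max_depth=1).crawl("https://example.com", 0)  # [{'url', 'headings', 'paragraphs'}, ...]
    rag = RAGSystem()
    rag.process_content(pages)        # chunk, embed, and index the crawled text
    bot = ChatDemo(rag)
    print(bot.chatbot("What is this site about?"))
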
.env
ADDED
@@ -0,0 +1 @@
OPENAI_API_KEY=sk-proj-8BdwEtUg42Q651Jy0GPFT3BlbkFJLmIfOiZ3cXLaD5NyHqAF
.flaskenv
ADDED
@@ -0,0 +1,3 @@
FLASK_APP=proxy_server.py
FLASK_ENV=development
FLASK_DEBUG=1
.gitignore
ADDED
@@ -0,0 +1 @@
__pycache__
README.md
CHANGED
@@ -1,12 +1,6 @@
 ---
-title:
-
-colorFrom: gray
-colorTo: pink
+title: lazzloe.com
+app_file: chatbot_ui.py
 sdk: gradio
 sdk_version: 4.44.1
-app_file: app.py
-pinned: false
 ---
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
chat_demo.py
ADDED
@@ -0,0 +1,56 @@
import openai
import logging

# Configure logging
logger = logging.getLogger(__name__)

class ChatDemo:
    def __init__(self, rag_system):
        self.client = openai.OpenAI()
        self.rag_system = rag_system
        self.last_context_strings = []  # Attribute to store content strings
        logger.info("ChatDemo initialized with RAG system")

    def chatbot(self, user_input):
        logger.info(f"Received user input: {user_input}")
        try:
            similar_chunks = self.rag_system.process_user_query(user_input)
            logger.debug(f"Retrieved {len(similar_chunks)} similar chunks from RAG system")

            # Extract and store content strings
            self.last_context_strings = [chunk['content'] for chunk in similar_chunks]

            # Log each content string
            logger.debug(f"Logging {len(self.last_context_strings)} context strings:")
            for i, content in enumerate(self.last_context_strings):
                logger.debug(f"Context string {i + 1}: {content[:100]}...")  # Log first 100 chars to avoid extremely long log lines

            context = "\n".join(self.last_context_strings)

            messages = [
                {"role": "system", "content": "You are a helpful assistant that answers questions based on the provided context."},
                {"role": "user", "content": f"Context: {context}\n\nQuestion: {user_input}"}
            ]

            logger.debug("Sending request to OpenAI API")
            response = self.client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=messages
            )

            answer = response.choices[0].message.content.strip()
            logger.info("Generated response from OpenAI API")
            return answer
        except Exception as e:
            logger.error(f"Error in chatbot method: {str(e)}", exc_info=True)
            return f"An error occurred: {str(e)}"

    def get_last_context_strings(self):
        """
        Retrieve the list of content strings from the last chatbot query.

        Returns:
            list: A list of strings, each representing the content of a chunk used in the last context.
        """
        logger.info(f"Retrieving last context strings (count: {len(self.last_context_strings)})")
        return self.last_context_strings
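
ChatDemo can also be exercised outside the Gradio UI; a sketch of a simple console loop, assuming `rag` is a RAGSystem already populated via process_content:

    # Hypothetical console loop around ChatDemo (mirrors the CLI loop in the project snapshot).
    demo = ChatDemo(rag)
    while True:
        question = input("You: ")
        if question.lower() in ("exit", "quit"):
            break
        print("Assistant:", demo.chatbot(question))
        print("Context chunks used:", len(demo.get_last_context_strings()))
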
chatbot_ui.py
ADDED
@@ -0,0 +1,184 @@
import gradio as gr
from web_crawler import WebCrawler
from rag_system import RAGSystem
from chat_demo import ChatDemo
import validators
import logging
from proxy_thread import ProxyThread
import requests

# Configure logging
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Instantiate your existing classes
rag_system = RAGSystem()
chat_demo = ChatDemo(rag_system)

# Global variables to track the URL and server state
url = ""
proxy_thread = None

def start_proxy_server():
    global proxy_thread
    # If the proxy server is already running, stop it first
    if proxy_thread and proxy_thread.is_alive():
        logger.info("Stopping the existing ProxyThread...")
        proxy_thread.stop()

    # Start a new proxy server
    proxy_thread = ProxyThread(host='localhost', port=5000)
    proxy_thread.start()
    logger.info("New ProxyThread started.")

def load_website(input_url):
    global url
    url = input_url  # Update the global url variable
    logger.info(f"Loading website: {url}")

    # Restart the ProxyServer with the new URL
    start_proxy_server()

    # Proxy server expects a specific URL format with target_url
    proxied_url = f"http://127.0.0.1:5000/?target_url={url}"

    iframe_html = f"""
    <iframe src="{proxied_url}" width="100%" height="600px"></iframe>
    """
    return iframe_html

def highlight_text(text):
    if not url:
        return "No website loaded."

    logger.info(f"Highlighting text: {text}")
    try:
        response = requests.post(f'http://127.0.0.1:5000/set_highlight', json={"highlight": text})
        if response.status_code == 200:
            return "Highlight applied."
        else:
            return "Failed to apply highlight."
    except Exception as e:
        logger.error(f"Error highlighting text: {e}")
        return f"Error: {e}"

def clear_highlights():
    if not url:
        return "No website loaded."

    logger.info("Clearing highlights and reloading the website.")
    start_proxy_server()  # Restart the proxy server to clear highlights
    return load_website(url)  # Reload the current website without highlights

# Function to handle the initial URL submission
def process_url(input_url):
    logger.info(f"Processing URL: {input_url}")
    if not validators.url(input_url):
        logger.error(f"Invalid URL submitted: {input_url}")
        return "Invalid URL. Please enter a valid URL.", None

    try:
        # Display loading message
        status_message = "Crawling website and processing data..."
        logger.info(status_message)

        # Instantiate WebCrawler with the provided URL
        web_crawler = WebCrawler()

        # Crawl the website
        logger.info(f"Starting web crawl for {input_url}")
        extracted_content = web_crawler.crawl(input_url, 0)
        logger.info(f"Web crawl completed for {input_url}")

        # Process the data with the RAG system
        logger.info("Processing extracted content with RAG system")
        rag_system.process_content(extracted_content)

        # Load the website through the proxy
        iframe_html = load_website(input_url)

        logger.info("URL processing completed successfully")
        return "Website content successfully crawled and processed!", [], iframe_html
    except Exception as e:
        logger.error(f"Error processing URL {input_url}: {str(e)}", exc_info=True)
        return f"Error: {str(e)}", []

# Function to handle chatbot interactions
def chatbot_response(user_input, chat_history):
    logger.info(f"Received user input: {user_input}")
    try:
        # Use the ChatDemo class to generate a response
        logger.info("Generating chatbot response")
        response = chat_demo.chatbot(user_input)
        chat_history.append(["User", user_input])
        chat_history.append(["Chatbot", response])

        logger.info("Chatbot response generated successfully")

        # Get the context strings used for the response
        context_strings = chat_demo.get_last_context_strings()
        logger.info(f"Retrieved {len(context_strings)} context strings")

        # Highlight each context string individually
        for i, context in enumerate(context_strings, 1):
            highlight_result = highlight_text(context)
            logger.info(f"Highlight result for context {i}: {highlight_result}")

        # Prepare status message
        highlight_status = f"Highlighted {len(context_strings)} context passages"
        logger.info(highlight_status)

        # Update the chat history and return
        return chat_history, chat_history, highlight_status
    except Exception as e:
        logger.error(f"Error in chatbot_response: {str(e)}", exc_info=True)
        return [[f"Error: {str(e)}"], chat_history], chat_history, f"Error: {str(e)}"

# Function to reset the application
def reset_app():
    global url
    url = ""  # Clear the global URL
    logger.info("Resetting application and proxy server")
    start_proxy_server()  # Restart the proxy server for a fresh session
    return "", [], "", ""

# Build the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Website Concierge")

    with gr.Row():
        with gr.Column(scale=1):
            url_input = gr.Textbox(placeholder="Enter a website URL", label="Website URL", interactive=True)
            submit_button = gr.Button("Submit URL")
            status_message = gr.Textbox(label="Status", interactive=False)

            chat_history = gr.State(value=[])
            chatbot_output = gr.Chatbot(label="Chat History")
            user_input = gr.Textbox(placeholder="Ask the chatbot...", label="User Input", interactive=True)

            highlight_status = gr.Textbox(label="Highlight Status", interactive=False)

            clear_button = gr.Button("Clear Highlights")
            reset_button = gr.Button("Change Website")

        with gr.Column(scale=1):
            proxied_view = gr.HTML(label="Website View")

    # Initial URL submission
    submit_button.click(process_url, inputs=url_input, outputs=[status_message, chat_history, proxied_view], queue=True)

    # Handle chatbot responses
    user_input.submit(chatbot_response, inputs=[user_input, chat_history], outputs=[chatbot_output, chat_history, highlight_status])

    # Handle clearing highlights
    clear_button.click(clear_highlights, outputs=[proxied_view])

    # Handle reset button click
    reset_button.click(reset_app, outputs=[url_input, chat_history, status_message, proxied_view])

# Launch the app
if __name__ == "__main__":
    logger.info("Starting Gradio application")
    start_proxy_server()  # Start with an initial ProxyServer
    demo.launch()
    logger.info("Gradio application stopped")
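
chatbot_ui.py starts the proxy thread on port 5000 and then calls demo.launch() with Gradio defaults. If the default Gradio port clashes with something else on the machine, launch() accepts explicit server settings; a hedged alternative (the port number is only illustrative):

    # Hypothetical alternative to the plain demo.launch() call above.
    demo.launch(server_name="127.0.0.1", server_port=7861, share=False)
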
project_snapshot_no_images.json
ADDED
@@ -0,0 +1,10 @@
{
".env": "OPENAI_API_KEY=sk-proj-8BdwEtUg42Q651Jy0GPFT3BlbkFJLmIfOiZ3cXLaD5NyHqAF\n",
"proxy_server.py": "import time\nfrom flask import Flask, request, Response, session, jsonify\nfrom flask_socketio import SocketIO, emit\nfrom werkzeug.serving import make_server\nimport requests\nimport re\nimport logging\nimport threading\n\nclass ProxyServer:\n def __init__(self, secret_key, host='localhost', port=5000):\n self.host = host\n self.port = port\n self.app = Flask(__name__)\n self.app.secret_key = secret_key\n self.server = None\n self.is_running = False\n self.socketio = SocketIO(self.app) # Initialize SocketIO with the Flask app\n # self.server_thread = None # Thread for running the server\n # self.server_running = False # Flag to track server state\n self.setup_routes()\n self.highlight_word = None # Initialize the highlight word\n \n @self.app.route('/shutdown', methods=['POST'])\n def shutdown():\n self.logger.info(\"Shutdown request received\")\n self.shutdown_server()\n return 'Server shutting down...'\n \n # Setup logging\n self.logger = logging.getLogger('ProxyServer')\n self.logger.setLevel(logging.DEBUG) # Set the logging level (DEBUG, INFO, WARNING, ERROR)\n handler = logging.StreamHandler() # Log to standard output (can be customized to log to a file)\n handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))\n self.logger.addHandler(handler)\n\n self.logger.info(\"Proxy server initialized\")\n\n # Inject JavaScript into HTML content to highlight words and listen for WebSocket updates\n def inject_script(self, content):\n # Inject the WebSocket listening script\n script = f\"\"\"\n <script src=\"https://cdnjs.cloudflare.com/ajax/libs/socket.io/4.0.1/socket.io.js\"></script>\n <script>\n let currentHighlight = \"{self.highlight_word}\";\n function highlightWord(word) {{\n if (word) {{\n document.body.innerHTML = document.body.innerHTML.replace(\n new RegExp(word, 'g'),\n '<span style=\"background-color: yellow;\">' + word + '</span>'\n );\n }}\n }}\n highlightWord(currentHighlight);\n\n // Connect to WebSocket\n const socket = io();\n socket.on('new_highlight', function(data) {{\n currentHighlight = data.highlight;\n highlightWord(currentHighlight);\n }});\n </script>\n \"\"\"\n return re.sub(r'</body>', script + '</body>', content)\n\n # Ensure the target_url and path are handled correctly\n def build_full_url(self, target_url, path):\n if not target_url.endswith('/') and not path.startswith('/'):\n return f\"{target_url}/{path}\"\n return f\"{target_url}{path}\"\n\n # Route handler for proxying requests\n def proxy(self, path=''):\n target_url = request.args.get('target_url')\n if not target_url and 'target_url' in session:\n target_url = session['target_url']\n elif target_url:\n session['target_url'] = target_url\n\n if not target_url:\n self.logger.error(\"No target_url provided\")\n return \"Error: target_url query parameter is required\", 400\n\n full_target_url = self.build_full_url(target_url, path)\n self.logger.info(f\"Proxying request to {full_target_url}\")\n\n headers = {key: value for key, value in request.headers if key != 'Host'}\n\n # Handle POST or GET requests\n if request.method == 'POST':\n response = requests.post(full_target_url, headers=headers, data=request.get_data(), stream=True)\n else:\n response = requests.get(full_target_url, headers=headers, stream=True)\n\n # If it's HTML content, inject the script\n if 'text/html' in response.headers.get('Content-Type', ''):\n def generate():\n for chunk in response.iter_content(chunk_size=1024):\n if chunk:\n rewritten_chunk = 
self.inject_script(chunk.decode('utf-8'))\n yield rewritten_chunk.encode('utf-8')\n self.logger.info(f\"Injecting script into HTML response from {full_target_url}\")\n return Response(generate(), content_type=response.headers['Content-Type'])\n\n # Stream non-HTML content (images, scripts, etc.)\n else:\n def generate():\n for chunk in response.iter_content(chunk_size=1024):\n if chunk:\n yield chunk\n return Response(generate(), content_type=response.headers['Content-Type'])\n\n # API endpoint to set a new highlight word\n def set_highlight(self):\n new_highlight = request.json.get('highlight')\n if new_highlight:\n self.highlight_word = new_highlight\n # Emit the new highlight word to all connected clients\n self.socketio.emit('new_highlight', {'highlight': new_highlight})\n self.logger.info(f\"Highlight word updated to '{new_highlight}' and broadcasted to clients\")\n return jsonify({\"message\": \"Highlight word updated\", \"highlight\": new_highlight}), 200\n self.logger.error(\"No highlight word provided\")\n return jsonify({\"error\": \"No highlight word provided\"}), 400\n\n # Setup routes to proxy all requests and WebSocket events\n def setup_routes(self):\n self.app.add_url_rule('/', defaults={'path': ''}, view_func=self.proxy, methods=['GET', 'POST', 'PUT', 'DELETE', 'OPTIONS'])\n self.app.add_url_rule('/<path:path>', view_func=self.proxy, methods=['GET', 'POST', 'PUT', 'DELETE', 'OPTIONS'])\n self.app.add_url_rule('/set_highlight', view_func=self.set_highlight, methods=['POST'])\n\n def run(self):\n \"\"\"Runs the Werkzeug server\"\"\"\n logging.info(f\"Starting server on {self.host}:{self.port}\")\n self.server = make_server(self.host, self.port, self.app, threaded=True)\n self.is_running = True\n self.server.serve_forever()\n\n def shutdown_server(self):\n \"\"\"Shuts down the Werkzeug server\"\"\"\n if self.server:\n self.logger.info(\"Shutting down server...\")\n self.is_running = False\n self.server.shutdown()\n self.logger.info(\"Server shut down complete\")\n\n# Create an instance of ProxyServer and expose the app\n# proxy_server = ProxyServer(secret_key='your_secret_key_here')\n# app = proxy_server.app # Expose the Flask app to the top-level scope for Flask CLI\n\n# if __name__ == '__main__':\n# proxy_server.run(port=5000)\n",
"rag_system.py": "import openai\nimport faiss\nimport numpy as np\nimport sys\nfrom sklearn.metrics.pairwise import cosine_similarity\nfrom chat_demo import ChatDemo\nfrom web_crawler import WebCrawler \n\nclass RAGSystem:\n def __init__(self, model_name=\"text-embedding-ada-002\"):\n self.client = openai.OpenAI()\n self.model_name = model_name\n self.index = None\n self.faiss_data = []\n\n def split_into_chunks(self, page_data, max_chunk_size=500):\n chunks = []\n for page in page_data:\n url = page['url']\n for paragraph in page['paragraphs']:\n if len(paragraph) <= max_chunk_size:\n chunks.append({'content': paragraph, 'url': url})\n else:\n # Break long paragraphs into smaller chunks\n for i in range(0, len(paragraph), max_chunk_size):\n chunks.append({'content': paragraph[i:i+max_chunk_size], 'url': url})\n return chunks\n\n def compute_embeddings(self, text_chunks):\n texts = [chunk['content'] for chunk in text_chunks]\n response = self.client.embeddings.create(model=self.model_name, input=texts)\n embeddings = [result.embedding for result in response.data]\n return embeddings\n\n def store_embeddings_in_faiss(self, embeddings, text_chunks):\n # Initialize a FAISS index\n dimension = len(embeddings[0])\n self.index = faiss.IndexFlatL2(dimension)\n \n for idx, embedding in enumerate(embeddings):\n np_embedding = np.array(embedding, dtype='float32')\n self.index.add(np_embedding.reshape(1, -1))\n self.faiss_data.append({\n 'embedding': np_embedding,\n 'content': text_chunks[idx]['content'],\n 'url': text_chunks[idx]['url']\n })\n\n def process_content(self, website_data):\n # Split data into chunks\n text_chunks = self.split_into_chunks(website_data)\n \n # Compute embeddings and create vector database\n embeddings = self.compute_embeddings(text_chunks)\n self.store_embeddings_in_faiss(embeddings, text_chunks)\n\n def process_user_query(self, query):\n # Compute the embedding of the query\n response = self.client.embeddings.create(model=self.model_name, input=[query])\n query_embedding = response.data[0].embedding\n np_query_embedding = np.array(query_embedding, dtype='float32').reshape(1, -1)\n\n # Find the most similar embeddings\n distances, indices = self.index.search(np_query_embedding, 5) # Retrieve top 5 similar results\n similar_chunks = [self.faiss_data[i] for i in indices[0]]\n\n return similar_chunks\n\n# Main function\nif __name__ == \"__main__\":\n if len(sys.argv) != 2:\n print(\"Usage: python rag_manager.py <url>\")\n sys.exit(1)\n\n url = sys.argv[1]\n # Crawl the website\n crawler = WebCrawler()\n website_data = crawler.crawl(url, 2)\n print(\"website data\", website_data)\n\n # Initialize RAGManager\n rag_system = RAGSystem()\n\n # Process content from page data\n rag_system.process_content(website_data)\n\n # Initialize ChatDemo with RAGManager\n chat_demo = ChatDemo(rag_system)\n\n # Command-line interaction loop for user queries\n while True:\n user_query = input(\"You: \")\n if user_query.lower() in [\"exit\", \"quit\"]:\n print(\"Goodbye!\")\n break\n response = chat_demo.chatbot(user_query)\n print(f\"Assistant: {response}\")",
"web_crawler.py": "import requests\nfrom bs4 import BeautifulSoup\nfrom urllib.parse import urljoin, urlparse\nimport json\nimport re\nimport time\nimport sys\n\nclass WebCrawler:\n def __init__(self, max_depth=2):\n self.base_url = None\n self.visited = set()\n self.max_depth = max_depth\n self.data = []\n self.session = requests.Session()\n self.delay = 0.1 # Delay between requests to prevent overwhelming the server\n\n def can_crawl(self, url):\n \"\"\"Check robots.txt to see if crawling is allowed.\"\"\"\n parsed_url = urlparse(url)\n robots_url = urljoin(f\"{parsed_url.scheme}://{parsed_url.netloc}\", '/robots.txt')\n try:\n response = self.session.get(robots_url, timeout=10)\n if response.status_code == 200:\n disallowed_paths = re.findall(r'Disallow: (.+)', response.text)\n for path in disallowed_paths:\n if url.startswith(urljoin(self.base_url, path.strip())):\n return False\n except requests.RequestException:\n pass\n return True\n\n def fetch(self, url):\n \"\"\"Fetch the content of a URL.\"\"\"\n try:\n response = self.session.get(url, timeout=10)\n response.raise_for_status()\n return response.text\n except requests.RequestException as e:\n print(f\"Error fetching {url}: {e}\")\n return None\n\n def parse(self, html_content, url):\n \"\"\"Parse the HTML content and extract headings, paragraphs, and links.\"\"\"\n soup = BeautifulSoup(html_content, 'html.parser')\n page_data = {\n 'url': url,\n 'headings': [heading.get_text(strip=True) for heading in soup.find_all(re.compile('^h[1-6]$'))],\n 'paragraphs': [p.get_text(strip=True) for p in soup.find_all('p')],\n }\n self.data.append(page_data)\n return [urljoin(url, a['href']) for a in soup.find_all('a', href=True)]\n\n def crawl(self, url, depth):\n \"\"\"Crawl a given URL up to a certain depth.\"\"\"\n if depth > self.max_depth or url in self.visited or not self.can_crawl(url):\n return\n\n print(f\"Crawling: {url} at depth {depth}\")\n self.base_url = url\n self.visited.add(url)\n html_content = self.fetch(url)\n if html_content:\n links = self.parse(html_content, url)\n for link in links:\n if link.startswith(self.base_url): # Stay within the same domain\n time.sleep(self.delay) # Respectful crawling\n self.crawl(link, depth + 1)\n return self.get_data()\n\n def get_data(self):\n \"\"\"Return the crawled data.\"\"\"\n return self.data\n\nif __name__ == \"__main__\":\n if len(sys.argv) != 2:\n print(\"Usage: python web_crawler.py <URL>\")\n sys.exit(1)\n\n base_url = sys.argv[1]\n crawler = WebCrawler(max_depth=2)\n data = crawler.crawl(base_url, 0)\n print(json.dumps(data, indent=4))\n",
".gitignore": "__pycache__\n",
"chatbot_ui.py": "import gradio as gr\nfrom web_crawler import WebCrawler\nfrom rag_system import RAGSystem\nfrom chat_demo import ChatDemo\nimport validators\n\n# Instantiate your existing classes\nrag_system = RAGSystem()\nchat_demo = ChatDemo(rag_system)\n\n# Function to handle the initial URL submission\ndef process_url(url):\n if not validators.url(url):\n return \"Invalid URL. Please enter a valid URL.\", None\n \n try:\n # Display loading message\n status_message = \"Crawling website and processing data...\"\n \n # Instantiate WebCrawler with the provided URL\n web_crawler = WebCrawler()\n \n # Crawl the website\n extracted_content = web_crawler.crawl(url, 0)\n \n # Process the data with the RAG system\n rag_system.process_content(extracted_content)\n\n # Display the website content in an iframe\n iframe_html = f'<iframe src=\"{url}\" width=\"100%\" height=\"500px\"></iframe>'\n \n return \"Website content successfully crawled and processed!\", [], iframe_html\n except Exception as e:\n return f\"Error: {str(e)}\", []\n\n# Function to handle chatbot interactions\ndef chatbot_response(user_input, chat_history):\n try:\n # Use the ChatDemo class to generate a response\n response = chat_demo.chatbot(user_input)\n chat_history.append([\"User\", user_input])\n chat_history.append([\"Chatbot\", response])\n \n # Update the chat history and return\n return chat_history, chat_history\n except Exception as e:\n return [[\"Error\", str(e)]], chat_history\n\n# Function to reset the application\ndef reset_app():\n return \"\", [], \"\", \"\"\n\n# Build the Gradio interface\nwith gr.Blocks() as demo:\n url_input = gr.Textbox(placeholder=\"Enter a website URL\", label=\"Website URL\", interactive=True)\n submit_button = gr.Button(\"Submit URL\")\n status_message = gr.Textbox(label=\"Status\", interactive=False)\n chat_history = gr.State(value=[])\n chatbot_output = gr.Chatbot(label=\"Chat History\")\n user_input = gr.Textbox(placeholder=\"Ask the chatbot...\", label=\"User Input\", interactive=True)\n embedded_view = gr.HTML(label=\"Website View\")\n reset_button = gr.Button(\"Change Website\")\n\n # Initial URL submission\n submit_button.click(process_url, inputs=url_input, outputs=[status_message, chat_history, embedded_view], queue=True)\n\n # Handle chatbot responses\n user_input.submit(chatbot_response, inputs=[user_input, chat_history], outputs=[chatbot_output, chat_history])\n\n # Handle reset button click\n reset_button.click(reset_app, outputs=[url_input, chat_history, status_message, embedded_view])\n\n# Launch the app\ndemo.launch()",
".flaskenv": "FLASK_APP=proxy_server.py\nFLASK_ENV=development\nFLASK_DEBUG=1\n",
"proxy_thread.py": "import threading\nimport requests\nimport logging\nimport time\nfrom proxy_server import ProxyServer\n\nlogging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')\nlogger = logging.getLogger(__name__)\n\nclass ProxyThread(threading.Thread):\n def __init__(self, host='localhost', port=5000):\n threading.Thread.__init__(self)\n secret_key='your_secret_key_here'\n self.proxy_server = ProxyServer(secret_key, host, port)\n\n def run(self):\n \"\"\"Starts the proxy server in a thread\"\"\"\n logger.info(\"ProxyThread starting\")\n self.proxy_server.run()\n\n def stop(self):\n \"\"\"Stops the proxy server by sending a shutdown request\"\"\"\n logger.info(\"Stopping ProxyThread\")\n try:\n response = requests.post(f'http://{self.proxy_server.host}:{self.proxy_server.port}/shutdown', timeout=5)\n logger.info(f\"Shutdown request sent. Response: {response.text}\")\n except requests.RequestException as e:\n logger.error(f\"Error sending shutdown request: {e}\")\n\n # Wait for the server to shut down\n timeout = 10\n start_time = time.time()\n while self.proxy_server.is_running and time.time() - start_time < timeout:\n time.sleep(0.1)\n\n if self.proxy_server.is_running:\n logger.warning(\"Server did not shut down in time\")\n else:\n logger.info(\"Server has been shut down\")\n\n self.join(timeout=5)\n logger.info(\"ProxyThread stopped\")\n\n"
}
proxy_server.py
ADDED
@@ -0,0 +1,146 @@
import time
from flask import Flask, request, Response, session, jsonify
from flask_socketio import SocketIO, emit
from werkzeug.serving import make_server
import requests
import re
import logging

# Configure logging
logger = logging.getLogger(__name__)

class ProxyServer:
    def __init__(self, secret_key, host='localhost', port=5000):
        self.host = host
        self.port = port
        self.app = Flask(__name__)
        self.app.secret_key = secret_key
        self.server = None
        self.is_running = False
        self.socketio = SocketIO(self.app)  # Initialize SocketIO with the Flask app
        # self.server_thread = None  # Thread for running the server
        # self.server_running = False  # Flag to track server state
        self.setup_routes()
        self.highlight_word = None  # Initialize the highlight word

        @self.app.route('/shutdown', methods=['POST'])
        def shutdown():
            logger.info("Shutdown request received")
            self.shutdown_server()
            return 'Server shutting down...'

        logger.info("Proxy server initialized")

    # Inject JavaScript into HTML content to highlight words and listen for WebSocket updates
    def inject_script(self, content):
        # Inject the WebSocket listening script
        script = f"""
        <script src="https://cdnjs.cloudflare.com/ajax/libs/socket.io/4.0.1/socket.io.js"></script>
        <script>
            let currentHighlight = "{self.highlight_word}";
            function highlightWord(word) {{
                if (word) {{
                    document.body.innerHTML = document.body.innerHTML.replace(
                        new RegExp(word, 'g'),
                        '<span style="background-color: yellow;">' + word + '</span>'
                    );
                }}
            }}
            highlightWord(currentHighlight);

            // Connect to WebSocket
            const socket = io();
            socket.on('new_highlight', function(data) {{
                currentHighlight = data.highlight;
                highlightWord(currentHighlight);
            }});
        </script>
        """
        return re.sub(r'</body>', script + '</body>', content)

    # Ensure the target_url and path are handled correctly
    def build_full_url(self, target_url, path):
        if not target_url.endswith('/') and not path.startswith('/'):
            return f"{target_url}/{path}"
        return f"{target_url}{path}"

    # Route handler for proxying requests
    def proxy(self, path=''):
        target_url = request.args.get('target_url')
        if not target_url and 'target_url' in session:
            target_url = session['target_url']
        elif target_url:
            session['target_url'] = target_url

        if not target_url:
            logger.error("No target_url provided")
            return "Error: target_url query parameter is required", 400

        full_target_url = self.build_full_url(target_url, path)
        logger.info(f"Proxying request to {full_target_url}")

        headers = {key: value for key, value in request.headers if key != 'Host'}

        # Handle POST or GET requests
        if request.method == 'POST':
            response = requests.post(full_target_url, headers=headers, data=request.get_data(), stream=True)
        else:
            response = requests.get(full_target_url, headers=headers, stream=True)

        # If it's HTML content, inject the script
        if 'text/html' in response.headers.get('Content-Type', ''):
            def generate():
                for chunk in response.iter_content(chunk_size=1024):
                    if chunk:
                        rewritten_chunk = self.inject_script(chunk.decode('utf-8'))
                        yield rewritten_chunk.encode('utf-8')
            logger.info(f"Injecting script into HTML response from {full_target_url}")
            return Response(generate(), content_type=response.headers['Content-Type'])

        # Stream non-HTML content (images, scripts, etc.)
        else:
            def generate():
                for chunk in response.iter_content(chunk_size=1024):
                    if chunk:
                        yield chunk
            return Response(generate(), content_type=response.headers['Content-Type'])

    # API endpoint to set a new highlight word
    def set_highlight(self):
        new_highlight = request.json.get('highlight')
        if new_highlight:
            self.highlight_word = new_highlight
            # Emit the new highlight word to all connected clients
            self.socketio.emit('new_highlight', {'highlight': new_highlight})
            logger.info(f"Highlight word updated to '{new_highlight}' and broadcasted to clients")
            return jsonify({"message": "Highlight word updated", "highlight": new_highlight}), 200
        logger.error("No highlight word provided")
        return jsonify({"error": "No highlight word provided"}), 400

    # Setup routes to proxy all requests and WebSocket events
    def setup_routes(self):
        self.app.add_url_rule('/', defaults={'path': ''}, view_func=self.proxy, methods=['GET', 'POST', 'PUT', 'DELETE', 'OPTIONS'])
        self.app.add_url_rule('/<path:path>', view_func=self.proxy, methods=['GET', 'POST', 'PUT', 'DELETE', 'OPTIONS'])
        self.app.add_url_rule('/set_highlight', view_func=self.set_highlight, methods=['POST'])

    def run(self):
        """Runs the Werkzeug server"""
        logging.info(f"Starting server on {self.host}:{self.port}")
        self.server = make_server(self.host, self.port, self.app, threaded=True)
        self.is_running = True
        self.server.serve_forever()

    def shutdown_server(self):
        """Shuts down the Werkzeug server"""
        if self.server:
            logger.info("Shutting down server...")
            self.is_running = False
            self.server.shutdown()
            logger.info("Server shut down complete")

# Create an instance of ProxyServer and expose the app
# proxy_server = ProxyServer(secret_key='your_secret_key_here')
# app = proxy_server.app  # Expose the Flask app to the top-level scope for Flask CLI

# if __name__ == '__main__':
#     proxy_server.run(port=5000)
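
Once the proxy is running, the highlight flow can be exercised by hand: POST a word to /set_highlight and the server broadcasts it over Socket.IO to every proxied page, which re-runs highlightWord(). A sketch of such a manual test (the word "pricing" is just an example):

    # Hypothetical manual test of the /set_highlight endpoint defined above.
    import requests
    resp = requests.post("http://127.0.0.1:5000/set_highlight", json={"highlight": "pricing"})
    print(resp.status_code, resp.json())  # expects 200 and {"message": "Highlight word updated", ...}
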
proxy_thread.py
ADDED
@@ -0,0 +1,43 @@
import threading
import requests
import logging
import time
from proxy_server import ProxyServer

# Configure logging
logger = logging.getLogger(__name__)

class ProxyThread(threading.Thread):
    def __init__(self, host='localhost', port=5000):
        threading.Thread.__init__(self)
        secret_key='your_secret_key_here'
        self.proxy_server = ProxyServer(secret_key, host, port)

    def run(self):
        """Starts the proxy server in a thread"""
        logger.info("ProxyThread starting")
        self.proxy_server.run()

    def stop(self):
        """Stops the proxy server by sending a shutdown request"""
        logger.info("Stopping ProxyThread")
        try:
            response = requests.post(f'http://{self.proxy_server.host}:{self.proxy_server.port}/shutdown', timeout=5)
            logger.info(f"Shutdown request sent. Response: {response.text}")
        except requests.RequestException as e:
            logger.error(f"Error sending shutdown request: {e}")

        # Wait for the server to shut down
        timeout = 10
        start_time = time.time()
        while self.proxy_server.is_running and time.time() - start_time < timeout:
            time.sleep(0.1)

        if self.proxy_server.is_running:
            logger.warning("Server did not shut down in time")
        else:
            logger.info("Server has been shut down")

        self.join(timeout=5)
        logger.info("ProxyThread stopped")
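
ProxyThread is what chatbot_ui.py uses to run the Flask proxy alongside Gradio; standalone use looks like the sketch below (host and port mirror the defaults):

    # Hypothetical standalone use of ProxyThread (mirrors start_proxy_server() in chatbot_ui.py).
    from proxy_thread import ProxyThread
    thread = ProxyThread(host='localhost', port=5000)
    thread.start()   # serves e.g. http://localhost:5000/?target_url=https://example.com
    # ... use the proxied site ...
    thread.stop()    # POSTs /shutdown, waits for the server to stop, then joins the thread
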
rag_system.py
ADDED
@@ -0,0 +1,72 @@
import openai
import faiss
import numpy as np
import logging
from sklearn.metrics.pairwise import cosine_similarity

# Configure logging
logger = logging.getLogger(__name__)

class RAGSystem:
    def __init__(self, model_name="text-embedding-ada-002"):
        self.client = openai.OpenAI()
        self.model_name = model_name
        self.index = None
        self.faiss_data = []
        logger.info(f"RAGSystem initialized with model: {model_name}")

    def split_into_chunks(self, page_data, max_chunk_size=500):
        logger.info(f"Splitting data into chunks with max size: {max_chunk_size}")
        chunks = []
        for page in page_data:
            url = page['url']
            for paragraph in page['paragraphs']:
                if len(paragraph) <= max_chunk_size:
                    chunks.append({'content': paragraph, 'url': url})
                else:
                    # Break long paragraphs into smaller chunks
                    for i in range(0, len(paragraph), max_chunk_size):
                        chunks.append({'content': paragraph[i:i+max_chunk_size], 'url': url})
        logger.debug(f"Created {len(chunks)} chunks")
        return chunks

    def compute_embeddings(self, text_chunks):
        logger.info(f"Computing embeddings for {len(text_chunks)} chunks")
        texts = [chunk['content'] for chunk in text_chunks]
        response = self.client.embeddings.create(model=self.model_name, input=texts)
        embeddings = [result.embedding for result in response.data]
        logger.debug(f"Computed {len(embeddings)} embeddings")
        return embeddings

    def store_embeddings_in_faiss(self, embeddings, text_chunks):
        logger.info("Storing embeddings in FAISS index")
        dimension = len(embeddings[0])
        self.index = faiss.IndexFlatL2(dimension)

        for idx, embedding in enumerate(embeddings):
            np_embedding = np.array(embedding, dtype='float32')
            self.index.add(np_embedding.reshape(1, -1))
            self.faiss_data.append({
                'embedding': np_embedding,
                'content': text_chunks[idx]['content'],
                'url': text_chunks[idx]['url']
            })
        logger.debug(f"Stored {len(embeddings)} embeddings in FAISS index")

    def process_content(self, website_data):
        logger.info("Processing website content")
        text_chunks = self.split_into_chunks(website_data)
        embeddings = self.compute_embeddings(text_chunks)
        self.store_embeddings_in_faiss(embeddings, text_chunks)
        logger.info("Content processing completed")

    def process_user_query(self, query):
        logger.info(f"Processing user query: {query}")
        response = self.client.embeddings.create(model=self.model_name, input=[query])
        query_embedding = response.data[0].embedding
        np_query_embedding = np.array(query_embedding, dtype='float32').reshape(1, -1)

        distances, indices = self.index.search(np_query_embedding, 5)  # Retrieve top 5 similar results
        similar_chunks = [self.faiss_data[i] for i in indices[0]]
        logger.debug(f"Retrieved {len(similar_chunks)} similar chunks for the query")
        return similar_chunks
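
After process_content() has built the FAISS index, retrieval can be inspected on its own; a sketch, assuming `rag` holds a populated RAGSystem and the query string is arbitrary:

    # Hypothetical retrieval check against the FAISS index built above.
    chunks = rag.process_user_query("What services are offered?")
    for chunk in chunks:                       # top-5 nearest chunks by L2 distance
        print(chunk['url'], chunk['content'][:80])
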
requirements.txt
ADDED
@@ -0,0 +1,11 @@
beautifulsoup4==4.12.3
faiss_cpu==1.8.0.post1
Flask==3.0.3
Flask_SocketIO==5.4.1
gradio==4.44.1
numpy==2.1.2
openai==1.51.1
Requests==2.32.3
scikit_learn==1.5.2
validators==0.28.1
Werkzeug==3.0.4
web_crawler.py
ADDED
@@ -0,0 +1,92 @@
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import json
import re
import time
import sys
import logging

# Configure logging
logger = logging.getLogger(__name__)

class WebCrawler:
    def __init__(self, max_depth=2):
        self.base_url = None
        self.visited = set()
        self.max_depth = max_depth
        self.data = []
        self.session = requests.Session()
        self.delay = 0.1  # Delay between requests to prevent overwhelming the server
        logger.info(f"WebCrawler initialized with max_depth: {max_depth}")

    def can_crawl(self, url):
        logger.debug(f"Checking if can crawl: {url}")
        parsed_url = urlparse(url)
        robots_url = urljoin(f"{parsed_url.scheme}://{parsed_url.netloc}", '/robots.txt')
        try:
            response = self.session.get(robots_url, timeout=10)
            if response.status_code == 200:
                disallowed_paths = re.findall(r'Disallow: (.+)', response.text)
                for path in disallowed_paths:
                    if url.startswith(urljoin(self.base_url, path.strip())):
                        logger.info(f"Crawling not allowed for: {url}")
                        return False
        except requests.RequestException:
            logger.warning(f"Error fetching robots.txt for {url}", exc_info=True)
        logger.debug(f"Crawling allowed for: {url}")
        return True

    def fetch(self, url):
        logger.info(f"Fetching content from: {url}")
        try:
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            logger.debug(f"Successfully fetched content from: {url}")
            return response.text
        except requests.RequestException as e:
            logger.error(f"Error fetching {url}: {e}", exc_info=True)
            return None

    def parse(self, html_content, url):
        logger.info(f"Parsing HTML content from: {url}")
        soup = BeautifulSoup(html_content, 'html.parser')
        page_data = {
            'url': url,
            'headings': [heading.get_text(strip=True) for heading in soup.find_all(re.compile('^h[1-6]$'))],
            'paragraphs': [p.get_text(strip=True) for p in soup.find_all('p')],
        }
        self.data.append(page_data)
        links = [urljoin(url, a['href']) for a in soup.find_all('a', href=True)]
        logger.debug(f"Parsed {len(links)} links from {url}")
        return links

    def crawl(self, url, depth):
        if depth > self.max_depth or url in self.visited or not self.can_crawl(url):
            return

        logger.info(f"Crawling: {url} at depth {depth}")
        self.base_url = url
        self.visited.add(url)
        html_content = self.fetch(url)
        if html_content:
            links = self.parse(html_content, url)
            for link in links:
                if link.startswith(self.base_url):  # Stay within the same domain
                    time.sleep(self.delay)  # Respectful crawling
                    self.crawl(link, depth + 1)
        return self.get_data()

    def get_data(self):
        logger.info(f"Returning crawled data: {len(self.data)} pages")
        return self.data

    if __name__ == "__main__":
        pass

if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: python web_crawler.py <URL>")
        sys.exit(1)

    base_url = sys.argv[1]
    crawler = WebCrawler(max_depth=2)
    data = crawler.crawl(base_url, 0)
    print(json.dumps(data, indent=4))
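
For a standalone crawl (the same path the module's __main__ block takes), something like the sketch below works; the URL is a placeholder:

    # Hypothetical standalone crawl mirroring the __main__ block above.
    crawler = WebCrawler(max_depth=1)
    pages = crawler.crawl("https://example.com", 0) or []
    print(len(pages), "pages crawled;",
          sum(len(p['paragraphs']) for p in pages), "paragraphs extracted")
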