Update app.py
app.py CHANGED

@@ -8,13 +8,13 @@ import numpy as np
 import requests
 import time
 import re
-import base64
 import logging
 import os
 import sys
-import concurrent.futures
-from concurrent.futures import ThreadPoolExecutor
 import threading
+from queue import Queue, Empty
+import json
+from concurrent.futures import ThreadPoolExecutor
 
 # Import OpenAI library
 import openai
@@ -83,9 +83,82 @@ if not GROQ_API_KEY:
 openai.api_key = GROQ_API_KEY
 openai.api_base = "https://api.groq.com/openai/v1"
 
-#
-
-
+# Rate Limiter Configuration
+RPM_LIMIT = 60  # Requests per minute (adjust based on your API's limit)
+TPM_LIMIT = 60000  # Tokens per minute (adjust based on your API's limit)
+BATCH_SIZE = 5  # Number of bookmarks per batch
+
+# Implementing a Token Bucket Rate Limiter
+class TokenBucket:
+    def __init__(self, rate, capacity):
+        self.rate = rate  # tokens per second
+        self.capacity = capacity
+        self.tokens = capacity
+        self.timestamp = time.time()
+        self.lock = threading.Lock()
+
+    def consume(self, tokens=1):
+        with self.lock:
+            now = time.time()
+            elapsed = now - self.timestamp
+            refill = elapsed * self.rate
+            self.tokens = min(self.capacity, self.tokens + refill)
+            self.timestamp = now
+            if self.tokens >= tokens:
+                self.tokens -= tokens
+                return True
+            else:
+                return False
+
+    def wait_for_token(self, tokens=1):
+        while not self.consume(tokens):
+            time.sleep(0.05)
+
+# Initialize rate limiters
+rpm_rate = RPM_LIMIT / 60  # tokens per second
+tpm_rate = TPM_LIMIT / 60  # tokens per second
+
+rpm_bucket = TokenBucket(rate=rpm_rate, capacity=RPM_LIMIT)
+tpm_bucket = TokenBucket(rate=tpm_rate, capacity=TPM_LIMIT)
+
+# Queue for LLM tasks
+llm_queue = Queue()
+
+def categorize_based_on_summary(summary, url):
+    """
+    Assign category based on keywords in the summary or URL.
+    """
+    summary_lower = summary.lower()
+    url_lower = url.lower()
+    if 'social media' in summary_lower or 'twitter' in summary_lower or 'x.com' in url_lower:
+        return 'Social Media'
+    elif 'wikipedia' in url_lower:
+        return 'Reference and Knowledge Bases'
+    elif 'cloud computing' in summary_lower or 'aws' in summary_lower:
+        return 'Technology'
+    elif 'news' in summary_lower or 'media' in summary_lower:
+        return 'News and Media'
+    elif 'education' in summary_lower or 'learning' in summary_lower:
+        return 'Education and Learning'
+    # Add more conditions as needed
+    else:
+        return 'Uncategorized'
+
+def validate_category(bookmark):
+    """
+    Further validate and adjust the category if needed.
+    """
+    # Example: Specific cases based on URL
+    url_lower = bookmark['url'].lower()
+    if 'facebook' in url_lower or 'x.com' in url_lower:
+        return 'Social Media'
+    elif 'wikipedia' in url_lower:
+        return 'Reference and Knowledge Bases'
+    elif 'aws.amazon.com' in url_lower:
+        return 'Technology'
+    # Add more specific cases as needed
+    else:
+        return bookmark['category']
 
 def extract_main_content(soup):
     """
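Review note: TokenBucket refills continuously at `rate` tokens per second up to `capacity`, so callers can burst up to the bucket size and are then throttled to the sustained rate. A minimal sketch of how the two buckets gate a call (hypothetical caller; the 900-token estimate is illustrative, not from the commit):

    def rate_limited_call(prompt, estimated_tokens=900):
        # Take one request slot from the RPM bucket, then the estimated
        # token budget from the TPM bucket; both block until available.
        rpm_bucket.wait_for_token()
        tpm_bucket.wait_for_token(tokens=estimated_tokens)
        return openai.ChatCompletion.create(
            model='llama-3.1-70b-versatile',
            messages=[{"role": "user", "content": prompt}],
        )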
@@ -156,171 +229,169 @@ def get_page_metadata(soup):
 
     return metadata
 
-def generate_summary_and_assign_category(bookmark):
+def llm_worker():
     """
-    Generate a concise summary and assign a category using a single LLM call.
+    Worker thread to process LLM tasks from the queue while respecting rate limits.
     """
-    logger.info(
-
-
-    retry_count = 0
-
-    while retry_count < max_retries:
+    logger.info("LLM worker started.")
+    while True:
+        batch = []
         try:
-            #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            prompt =
-
-
-
-
-
-
-
-
-
-
-
-
-
-You are an assistant that creates concise webpage summaries and assigns categories.
-Content:
-{content_text}
-Provide:
-1. A concise summary (max two sentences) focusing on the main topic.
-2. Assign the most appropriate category from the list below.
-Categories:
-{', '.join([f'"{cat}"' for cat in CATEGORIES])}
-Format:
-Summary: [Your summary]
-Category: [One category]
-"""
-
-            def estimate_tokens(text):
-                return len(text) / 4
-
-            prompt_tokens = estimate_tokens(prompt)
-            max_tokens = 150
-            total_tokens = prompt_tokens + max_tokens
-
-            tokens_per_minute = 40000
-            tokens_per_second = tokens_per_minute / 60
-            required_delay = total_tokens / tokens_per_second
-            sleep_time = max(required_delay, 2)
-
-            response = openai.ChatCompletion.create(
-                model='llama-3.1-70b-versatile',
-                messages=[
-                    {"role": "user", "content": prompt}
-                ],
-                max_tokens=int(max_tokens),
-                temperature=0.5,
-            )
-
-            content = response['choices'][0]['message']['content'].strip()
-            if not content:
-                raise ValueError("Empty response received from the model.")
+            # Collect bookmarks up to BATCH_SIZE
+            while len(batch) < BATCH_SIZE:
+                bookmark = llm_queue.get(timeout=1)
+                if bookmark is None:
+                    # Shutdown signal
+                    logger.info("LLM worker shutting down.")
+                    return
+                if not bookmark.get('dead_link') and not bookmark.get('slow_link'):
+                    batch.append(bookmark)
+                else:
+                    # Skip processing for dead or slow links
+                    bookmark['summary'] = 'No summary available.'
+                    bookmark['category'] = 'Uncategorized'
+                    llm_queue.task_done()
+
+        except Empty:
+            pass  # No more bookmarks at the moment
+
+        if batch:
+            try:
+                # Rate Limiting
+                rpm_bucket.wait_for_token()
+                # Estimate tokens: prompt + max_tokens
+                # Here, we assume max_tokens=150 per bookmark
+                total_tokens = 150 * len(batch)
+                tpm_bucket.wait_for_token(tokens=total_tokens)
+
+                # Prepare prompt
+                prompt = "You are an assistant that creates concise webpage summaries and assigns categories.\n\n"
+                prompt += "Provide summaries and categories for the following bookmarks:\n\n"
+
+                for idx, bookmark in enumerate(batch, 1):
+                    prompt += f"Bookmark {idx}:\nURL: {bookmark['url']}\nTitle: {bookmark['title']}\n\n"
+
+                # Build the category list outside the f-string to avoid backslash escapes
+                prompt += "Categories:\n" + ', '.join(f'"{cat}"' for cat in CATEGORIES) + "\n\n"
+
+                prompt += "Format your response as a JSON object where each key is the bookmark URL and the value is another JSON object containing 'summary' and 'category'.\n\n"
+                prompt += "Example:\n"
+                prompt += "{\n"
+                prompt += "  \"https://example.com\": {\n"
+                prompt += "    \"summary\": \"This is an example summary.\",\n"
+                prompt += "    \"category\": \"Technology\"\n"
+                prompt += "  }\n"
+                prompt += "}\n\n"
+                prompt += "Now, provide the summaries and categories for the bookmarks listed above."
+
+                response = openai.ChatCompletion.create(
+                    model='llama-3.1-70b-versatile',  # Ensure this model is correct and available
+                    messages=[
+                        {"role": "user", "content": prompt}
+                    ],
+                    max_tokens=150 * len(batch),
+                    temperature=0.5,
+                )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                break
-
-        except openai.error.RateLimitError as e:
-            retry_count += 1
-            wait_time = int(e.headers.get("Retry-After", 5))
-            logger.warning(f"Rate limit reached. Waiting for {wait_time} seconds before retrying... (Attempt {retry_count}/{max_retries})")
-            time.sleep(wait_time)
-        except Exception as e:
-            logger.error(f"Error generating summary and assigning category: {e}", exc_info=True)
-            bookmark['summary'] = 'No summary available.'
-            bookmark['category'] = 'Uncategorized'
-            break
+                content = response['choices'][0]['message']['content'].strip()
+                if not content:
+                    raise ValueError("Empty response received from the model.")
+
+                # Parse JSON response
+                try:
+                    json_response = json.loads(content)
+                    for bookmark in batch:
+                        url = bookmark['url']
+                        if url in json_response:
+                            summary = json_response[url].get('summary', '').strip()
+                            category = json_response[url].get('category', '').strip()
+
+                            if not summary:
+                                summary = 'No summary available.'
+                            bookmark['summary'] = summary
+
+                            if category in CATEGORIES:
+                                bookmark['category'] = category
+                            else:
+                                # Fallback to keyword-based categorization
+                                bookmark['category'] = categorize_based_on_summary(summary, url)
+                        else:
+                            logger.warning(f"No data returned for {url}. Using fallback methods.")
+                            bookmark['summary'] = 'No summary available.'
+                            bookmark['category'] = 'Uncategorized'
+
+                        # Additional keyword-based validation
+                        bookmark['category'] = validate_category(bookmark)
+
+                        logger.info(f"Processed bookmark: {url}")
+
+                except json.JSONDecodeError:
+                    logger.error("Failed to parse JSON response from LLM. Using fallback methods.")
+                    for bookmark in batch:
+                        bookmark['summary'] = 'No summary available.'
+                        bookmark['category'] = categorize_based_on_summary(bookmark.get('summary', ''), bookmark['url'])
+                        bookmark['category'] = validate_category(bookmark)
+
+                except Exception as e:
+                    logger.error(f"Error processing LLM response: {e}", exc_info=True)
+                    for bookmark in batch:
+                        bookmark['summary'] = 'No summary available.'
+                        bookmark['category'] = 'Uncategorized'
+
+            except openai.error.RateLimitError as e:
+                logger.warning("LLM Rate limit reached. Retrying after 60 seconds.")
+                # Re-enqueue the entire batch for retry
+                for bookmark in batch:
+                    llm_queue.put(bookmark)
+                time.sleep(60)  # Wait before retrying
+                continue  # Skip the rest and retry
+
+            except Exception as e:
+                logger.error(f"Error during LLM processing: {e}", exc_info=True)
+                for bookmark in batch:
+                    bookmark['summary'] = 'No summary available.'
+                    bookmark['category'] = 'Uncategorized'
+
+            finally:
+                # Mark all bookmarks in the batch as done
+                for _ in batch:
+                    llm_queue.task_done()
 
-def parse_bookmarks(file_content):
-    """
-    Parse bookmarks from HTML file.
-    """
-
-
-
-
-
-
-
-
-
-
-
-                    logger.info(f"Skipping non-http/https URL: {url}")
-        logger.info(f"Extracted {len(extracted_bookmarks)} bookmarks")
-        return extracted_bookmarks
-    except Exception as e:
-        logger.error("Error parsing bookmarks: %s", e, exc_info=True)
-        raise
+def categorize_based_on_summary(summary, url):
+    """
+    Assign category based on keywords in the summary or URL.
+    """
+    summary_lower = summary.lower()
+    url_lower = url.lower()
+    if 'social media' in summary_lower or 'twitter' in summary_lower or 'x.com' in url_lower:
+        return 'Social Media'
+    elif 'wikipedia' in url_lower:
+        return 'Reference and Knowledge Bases'
+    elif 'cloud computing' in summary_lower or 'aws' in summary_lower:
+        return 'Technology'
+    elif 'news' in summary_lower or 'media' in summary_lower:
+        return 'News and Media'
+    elif 'education' in summary_lower or 'learning' in summary_lower:
+        return 'Education and Learning'
+    # Add more conditions as needed
+    else:
+        return 'Uncategorized'
+
+def validate_category(bookmark):
+    """
+    Further validate and adjust the category if needed.
+    """
+    # Example: Specific cases based on URL
+    url_lower = bookmark['url'].lower()
+    if 'facebook' in url_lower or 'x.com' in url_lower:
+        return 'Social Media'
+    elif 'wikipedia' in url_lower:
+        return 'Reference and Knowledge Bases'
+    elif 'aws.amazon.com' in url_lower:
+        return 'Technology'
+    # Add more specific cases as needed
+    else:
+        return bookmark['category']
 
 def fetch_url_info(bookmark):
     """
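Review note: `llm_worker()` exits only when it dequeues a `None` sentinel, and the `task_done()` calls in its `finally` block are what let `llm_queue.join()` return. A sketch of a graceful shutdown under those assumptions (hypothetical call site; names as defined in this file):

    llm_queue.join()      # wait until every enqueued bookmark is acknowledged
    llm_queue.put(None)   # sentinel: llm_worker() returns when it sees None
    llm_thread.join(timeout=5)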
@@ -382,6 +453,28 @@ def fetch_url_info(bookmark):
         'slow_link': bookmark.get('slow_link', False),
     }
 
+def parse_bookmarks(file_content):
+    """
+    Parse bookmarks from HTML file.
+    """
+    logger.info("Parsing bookmarks")
+    try:
+        soup = BeautifulSoup(file_content, 'html.parser')
+        extracted_bookmarks = []
+        for link in soup.find_all('a'):
+            url = link.get('href')
+            title = link.text.strip()
+            if url and title:
+                if url.startswith('http://') or url.startswith('https://'):
+                    extracted_bookmarks.append({'url': url, 'title': title})
+                else:
+                    logger.info(f"Skipping non-http/https URL: {url}")
+        logger.info(f"Extracted {len(extracted_bookmarks)} bookmarks")
+        return extracted_bookmarks
+    except Exception as e:
+        logger.error("Error parsing bookmarks: %s", e, exc_info=True)
+        raise
+
 def vectorize_and_index(bookmarks_list):
     """
     Create vector embeddings for bookmarks and build FAISS index with ID mapping.
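Review note: `parse_bookmarks` relies only on `<a href>` anchors, so any HTML with links parses, not just the Netscape bookmarks export format. A quick sanity check (hypothetical snippet):

    sample = '<DL><DT><A HREF="https://example.com">Example</A></DL>'
    print(parse_bookmarks(sample))
    # -> [{'url': 'https://example.com', 'title': 'Example'}]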
@@ -453,6 +546,14 @@ def display_bookmarks():
     logger.info("HTML display generated")
     return cards
 
+def generate_summary_and_assign_category(bookmark):
+    """
+    Generate a concise summary and assign a category using a single LLM call.
+    This function is now handled by the LLM worker thread.
+    """
+    # This function is now deprecated and handled by the worker thread.
+    pass
+
 def process_uploaded_file(file, state_bookmarks):
     """
     Process the uploaded bookmarks file.
@@ -489,10 +590,14 @@ def process_uploaded_file(file, state_bookmarks):
     with ThreadPoolExecutor(max_workers=10) as executor:
         executor.map(fetch_url_info, bookmarks)
 
-    #
-    logger.info("
-
-
+    # Enqueue bookmarks for LLM processing
+    logger.info("Enqueuing bookmarks for LLM processing")
+    for bookmark in bookmarks:
+        llm_queue.put(bookmark)
+
+    # Wait until all LLM tasks are completed
+    llm_queue.join()
+    logger.info("All LLM tasks have been processed")
 
     try:
         faiss_index = vectorize_and_index(bookmarks)
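Review note: `llm_queue.join()` blocks until `task_done()` has been called once per `put()`, which is why the worker acknowledges even skipped or failed bookmarks. The handshake in isolation (toy sketch, hypothetical names):

    from queue import Queue
    import threading

    q = Queue()

    def worker():
        while True:
            item = q.get()
            try:
                print("processed", item)
            finally:
                q.task_done()  # always acknowledge, or join() hangs

    threading.Thread(target=worker, daemon=True).start()
    for i in range(3):
        q.put(i)
    q.join()  # returns only after three task_done() calls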
@@ -619,15 +724,12 @@ def chatbot_response(user_query, chat_history):
     try:
         chat_history.append({"role": "user", "content": user_query})
 
-
-
-
-
-
-
-        logger.info(f"Sleeping for {sleep_duration:.2f} seconds to respect rate limits.")
-        time.sleep(sleep_duration)
-        last_api_call_time = time.time()
+        # Rate Limiting
+        rpm_bucket.wait_for_token()
+        # Estimate tokens: prompt + max_tokens
+        # Here, we assume max_tokens=300 per chatbot response
+        total_tokens = 300  # Adjust based on actual usage
+        tpm_bucket.wait_for_token(tokens=total_tokens)
 
         query_vector = embedding_model.encode([user_query]).astype('float32')
         k = 5
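Review note: the 300-token figure budgets only the completion; the prompt itself also consumes TPM. A closer estimate could reuse the ~4 characters-per-token heuristic the removed code used (hypothetical sketch):

    # After the prompt is assembled later in this function, a closer
    # estimate would also charge the prompt against the TPM bucket:
    total_tokens = 300 + len(prompt) // 4
    tpm_bucket.wait_for_token(tokens=total_tokens)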
@@ -635,7 +737,7 @@ def chatbot_response(user_query, chat_history):
     ids = ids.flatten()
 
     id_to_bookmark = {bookmark['id']: bookmark for bookmark in bookmarks}
-    matching_bookmarks = [id_to_bookmark.get(id) for id in ids if id in id_to_bookmark]
+    matching_bookmarks = [id_to_bookmark.get(id) for id in ids if id in id_to_bookmark and id_to_bookmark.get(id).get('summary')]
 
     if not matching_bookmarks:
         answer = "No relevant bookmarks found for your query."
@@ -655,30 +757,17 @@ Bookmarks:
 Provide a concise and helpful response.
 """
 
-        def estimate_tokens(text):
-            return len(text) / 4
-
-        prompt_tokens = estimate_tokens(prompt)
-        max_tokens = 300
-        total_tokens = prompt_tokens + max_tokens
-
-        tokens_per_minute = 40000
-        tokens_per_second = tokens_per_minute / 60
-        required_delay = total_tokens / tokens_per_second
-        sleep_time = max(required_delay, 2)
-
         response = openai.ChatCompletion.create(
-            model='llama-3.1-70b-versatile',
+            model='llama-3.1-70b-versatile',  # Ensure this model is correct and available
             messages=[
                 {"role": "user", "content": prompt}
             ],
-            max_tokens=
+            max_tokens=300,
             temperature=0.7,
         )
 
         answer = response['choices'][0]['message']['content'].strip()
         logger.info("Chatbot response generated")
-        time.sleep(sleep_time)
 
         chat_history.append({"role": "assistant", "content": answer})
         return chat_history
@@ -809,7 +898,7 @@ Navigate through the tabs to explore each feature in detail.
 """)
 
         manage_output = gr.Textbox(label="🔄 Status", interactive=False)
-
+
         # CheckboxGroup for selecting bookmarks
         bookmark_selector = gr.CheckboxGroup(
             label="✅ Select Bookmarks",
@@ -870,8 +959,12 @@ Navigate through the tabs to explore each feature in detail.
         logger.info("Launching Gradio app")
         demo.launch(debug=True)
     except Exception as e:
-        logger.error(f"Error building
-        print(f"Error building
+        logger.error(f"Error building Gradio app: {e}", exc_info=True)
+        print(f"Error building Gradio app: {e}")
 
 if __name__ == "__main__":
+    # Start the LLM worker thread before launching the app
+    llm_thread = threading.Thread(target=llm_worker, daemon=True)
+    llm_thread.start()
+
     build_app()
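Review note: the worker is started with `daemon=True`, so it is killed abruptly when the main thread exits and any batch in flight at shutdown is lost. A non-daemon variant would need the `None` sentinel (hypothetical sketch using names from this file):

    llm_thread = threading.Thread(target=llm_worker)  # non-daemon
    llm_thread.start()
    try:
        build_app()
    finally:
        llm_queue.put(None)  # ask llm_worker() to return
        llm_thread.join()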