Update app.py
app.py CHANGED

@@ -8,13 +8,13 @@ import numpy as np
 import requests
 import time
 import re
-import base64
 import logging
 import os
 import sys
-import concurrent.futures
-from concurrent.futures import ThreadPoolExecutor
 import threading
+from queue import Queue, Empty
+import json
+from concurrent.futures import ThreadPoolExecutor
 
 # Import OpenAI library
 import openai
@@ -83,9 +83,82 @@ if not GROQ_API_KEY:
 openai.api_key = GROQ_API_KEY
 openai.api_base = "https://api.groq.com/openai/v1"
 
-#
-
-
+# Rate Limiter Configuration
+RPM_LIMIT = 60  # Requests per minute (adjust based on your API's limit)
+TPM_LIMIT = 60000  # Tokens per minute (adjust based on your API's limit)
+BATCH_SIZE = 5  # Number of bookmarks per batch
+
+# Implementing a Token Bucket Rate Limiter
+class TokenBucket:
+    def __init__(self, rate, capacity):
+        self.rate = rate  # tokens per second
+        self.capacity = capacity
+        self.tokens = capacity
+        self.timestamp = time.time()
+        self.lock = threading.Lock()
+
+    def consume(self, tokens=1):
+        with self.lock:
+            now = time.time()
+            elapsed = now - self.timestamp
+            refill = elapsed * self.rate
+            self.tokens = min(self.capacity, self.tokens + refill)
+            self.timestamp = now
+            if self.tokens >= tokens:
+                self.tokens -= tokens
+                return True
+            else:
+                return False
+
+    def wait_for_token(self, tokens=1):
+        while not self.consume(tokens):
+            time.sleep(0.05)
+
+# Initialize rate limiters
+rpm_rate = RPM_LIMIT / 60  # tokens per second
+tpm_rate = TPM_LIMIT / 60  # tokens per second
+
+rpm_bucket = TokenBucket(rate=rpm_rate, capacity=RPM_LIMIT)
+tpm_bucket = TokenBucket(rate=tpm_rate, capacity=TPM_LIMIT)
+
+# Queue for LLM tasks
+llm_queue = Queue()
+
+def categorize_based_on_summary(summary, url):
+    """
+    Assign category based on keywords in the summary or URL.
+    """
+    summary_lower = summary.lower()
+    url_lower = url.lower()
+    if 'social media' in summary_lower or 'twitter' in summary_lower or 'x.com' in url_lower:
+        return 'Social Media'
+    elif 'wikipedia' in url_lower:
+        return 'Reference and Knowledge Bases'
+    elif 'cloud computing' in summary_lower or 'aws' in summary_lower:
+        return 'Technology'
+    elif 'news' in summary_lower or 'media' in summary_lower:
+        return 'News and Media'
+    elif 'education' in summary_lower or 'learning' in summary_lower:
+        return 'Education and Learning'
+    # Add more conditions as needed
+    else:
+        return 'Uncategorized'
+
+def validate_category(bookmark):
+    """
+    Further validate and adjust the category if needed.
+    """
+    # Example: Specific cases based on URL
+    url_lower = bookmark['url'].lower()
+    if 'facebook' in url_lower or 'x.com' in url_lower:
+        return 'Social Media'
+    elif 'wikipedia' in url_lower:
+        return 'Reference and Knowledge Bases'
+    elif 'aws.amazon.com' in url_lower:
+        return 'Technology'
+    # Add more specific cases as needed
+    else:
+        return bookmark['category']
 
 def extract_main_content(soup):
     """
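Review note: TokenBucket refills continuously at `rate` tokens per second up to `capacity`, so callers can burst up to the bucket size and are then throttled to the sustained rate. A minimal sketch of how the two buckets gate a call (hypothetical caller; the 900-token estimate is illustrative, not from the commit):

    def rate_limited_call(prompt, estimated_tokens=900):
        # Take one request slot from the RPM bucket, then the estimated
        # token budget from the TPM bucket; both block until available.
        rpm_bucket.wait_for_token()
        tpm_bucket.wait_for_token(tokens=estimated_tokens)
        return openai.ChatCompletion.create(
            model='llama-3.1-70b-versatile',
            messages=[{"role": "user", "content": prompt}],
        )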
@@ -156,171 +229,169 @@ def get_page_metadata(soup):
 
     return metadata
 
-def generate_summary_and_assign_category(bookmark):
+def llm_worker():
     """
-    Generate a concise summary and assign a category using a single LLM call.
+    Worker thread to process LLM tasks from the queue while respecting rate limits.
     """
-    logger.info(
-
-
-    retry_count = 0
-
-    while retry_count < max_retries:
+    logger.info("LLM worker started.")
+    while True:
+        batch = []
         try:
-            #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            prompt =
-
-
-
-
-
-
-
-
-
-
-
-
-
-You are an assistant that creates concise webpage summaries and assigns categories.
-Content:
-{content_text}
-Provide:
-1. A concise summary (max two sentences) focusing on the main topic.
-2. Assign the most appropriate category from the list below.
-Categories:
-{', '.join([f'"{cat}"' for cat in CATEGORIES])}
-Format:
-Summary: [Your summary]
-Category: [One category]
-"""
-
-            def estimate_tokens(text):
-                return len(text) / 4
-
-            prompt_tokens = estimate_tokens(prompt)
-            max_tokens = 150
-            total_tokens = prompt_tokens + max_tokens
-
-            tokens_per_minute = 40000
-            tokens_per_second = tokens_per_minute / 60
-            required_delay = total_tokens / tokens_per_second
-            sleep_time = max(required_delay, 2)
-
-            response = openai.ChatCompletion.create(
-                model='llama-3.1-70b-versatile',
-                messages=[
-                    {"role": "user", "content": prompt}
-                ],
-                max_tokens=int(max_tokens),
-                temperature=0.5,
-            )
-
-            content = response['choices'][0]['message']['content'].strip()
-            if not content:
-                raise ValueError("Empty response received from the model.")
+            # Collect bookmarks up to BATCH_SIZE
+            while len(batch) < BATCH_SIZE:
+                bookmark = llm_queue.get(timeout=1)
+                if bookmark is None:
+                    # Shutdown signal
+                    logger.info("LLM worker shutting down.")
+                    return
+                if not bookmark.get('dead_link') and not bookmark.get('slow_link'):
+                    batch.append(bookmark)
+                else:
+                    # Skip processing for dead or slow links
+                    bookmark['summary'] = 'No summary available.'
+                    bookmark['category'] = 'Uncategorized'
+                    llm_queue.task_done()
+
+        except Empty:
+            pass  # No more bookmarks at the moment
+
+        if batch:
+            try:
+                # Rate Limiting
+                rpm_bucket.wait_for_token()
+                # Estimate tokens: prompt + max_tokens
+                # Here, we assume max_tokens=150 per bookmark
+                total_tokens = 150 * len(batch)
+                tpm_bucket.wait_for_token(tokens=total_tokens)
+
+                # Prepare prompt
+                prompt = "You are an assistant that creates concise webpage summaries and assigns categories.\n\n"
+                prompt += "Provide summaries and categories for the following bookmarks:\n\n"
+
+                for idx, bookmark in enumerate(batch, 1):
+                    prompt += f"Bookmark {idx}:\nURL: {bookmark['url']}\nTitle: {bookmark['title']}\n\n"
+
+                # Build the category list outside the f-string to avoid backslash escapes
+                prompt += "Categories:\n" + ', '.join(f'"{cat}"' for cat in CATEGORIES) + "\n\n"
+
+                prompt += "Format your response as a JSON object where each key is the bookmark URL and the value is another JSON object containing 'summary' and 'category'.\n\n"
+                prompt += "Example:\n"
+                prompt += "{\n"
+                prompt += "  \"https://example.com\": {\n"
+                prompt += "    \"summary\": \"This is an example summary.\",\n"
+                prompt += "    \"category\": \"Technology\"\n"
+                prompt += "  }\n"
+                prompt += "}\n\n"
+                prompt += "Now, provide the summaries and categories for the bookmarks listed above."
+
+                response = openai.ChatCompletion.create(
+                    model='llama-3.1-70b-versatile',  # Ensure this model is correct and available
+                    messages=[
+                        {"role": "user", "content": prompt}
+                    ],
+                    max_tokens=150 * len(batch),
+                    temperature=0.5,
+                )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                break
-
-        except openai.error.RateLimitError as e:
-            retry_count += 1
-            wait_time = int(e.headers.get("Retry-After", 5))
-            logger.warning(f"Rate limit reached. Waiting for {wait_time} seconds before retrying... (Attempt {retry_count}/{max_retries})")
-            time.sleep(wait_time)
-        except Exception as e:
-            logger.error(f"Error generating summary and assigning category: {e}", exc_info=True)
-            bookmark['summary'] = 'No summary available.'
-            bookmark['category'] = 'Uncategorized'
-            break
+                content = response['choices'][0]['message']['content'].strip()
+                if not content:
+                    raise ValueError("Empty response received from the model.")
+
+                # Parse JSON response
+                try:
+                    json_response = json.loads(content)
+                    for bookmark in batch:
+                        url = bookmark['url']
+                        if url in json_response:
+                            summary = json_response[url].get('summary', '').strip()
+                            category = json_response[url].get('category', '').strip()
+
+                            if not summary:
+                                summary = 'No summary available.'
+                            bookmark['summary'] = summary
+
+                            if category in CATEGORIES:
+                                bookmark['category'] = category
+                            else:
+                                # Fallback to keyword-based categorization
+                                bookmark['category'] = categorize_based_on_summary(summary, url)
+                        else:
+                            logger.warning(f"No data returned for {url}. Using fallback methods.")
+                            bookmark['summary'] = 'No summary available.'
+                            bookmark['category'] = 'Uncategorized'
+
+                        # Additional keyword-based validation
+                        bookmark['category'] = validate_category(bookmark)
+
+                        logger.info(f"Processed bookmark: {url}")
+
+                except json.JSONDecodeError:
+                    logger.error("Failed to parse JSON response from LLM. Using fallback methods.")
+                    for bookmark in batch:
+                        bookmark['summary'] = 'No summary available.'
+                        bookmark['category'] = categorize_based_on_summary(bookmark.get('summary', ''), bookmark['url'])
+                        bookmark['category'] = validate_category(bookmark)
+
+                except Exception as e:
+                    logger.error(f"Error processing LLM response: {e}", exc_info=True)
+                    for bookmark in batch:
+                        bookmark['summary'] = 'No summary available.'
+                        bookmark['category'] = 'Uncategorized'
+
+            except openai.error.RateLimitError as e:
+                logger.warning("LLM Rate limit reached. Retrying after 60 seconds.")
+                # Re-enqueue the entire batch for retry
+                for bookmark in batch:
+                    llm_queue.put(bookmark)
+                time.sleep(60)  # Wait before retrying
+                continue  # Skip the rest and retry
+
+            except Exception as e:
+                logger.error(f"Error during LLM processing: {e}", exc_info=True)
+                for bookmark in batch:
+                    bookmark['summary'] = 'No summary available.'
+                    bookmark['category'] = 'Uncategorized'
+
+            finally:
+                # Mark all bookmarks in the batch as done
+                for _ in batch:
+                    llm_queue.task_done()
 
-def parse_bookmarks(file_content):
-    """
-    Parse bookmarks from HTML file.
-    """
-
-
-
-
-
-
-
-
-
-
-
-                    logger.info(f"Skipping non-http/https URL: {url}")
-        logger.info(f"Extracted {len(extracted_bookmarks)} bookmarks")
-        return extracted_bookmarks
-    except Exception as e:
-        logger.error("Error parsing bookmarks: %s", e, exc_info=True)
-        raise
+def categorize_based_on_summary(summary, url):
+    """
+    Assign category based on keywords in the summary or URL.
+    """
+    summary_lower = summary.lower()
+    url_lower = url.lower()
+    if 'social media' in summary_lower or 'twitter' in summary_lower or 'x.com' in url_lower:
+        return 'Social Media'
+    elif 'wikipedia' in url_lower:
+        return 'Reference and Knowledge Bases'
+    elif 'cloud computing' in summary_lower or 'aws' in summary_lower:
+        return 'Technology'
+    elif 'news' in summary_lower or 'media' in summary_lower:
+        return 'News and Media'
+    elif 'education' in summary_lower or 'learning' in summary_lower:
+        return 'Education and Learning'
+    # Add more conditions as needed
+    else:
+        return 'Uncategorized'
+
+def validate_category(bookmark):
+    """
+    Further validate and adjust the category if needed.
+    """
+    # Example: Specific cases based on URL
+    url_lower = bookmark['url'].lower()
+    if 'facebook' in url_lower or 'x.com' in url_lower:
+        return 'Social Media'
+    elif 'wikipedia' in url_lower:
+        return 'Reference and Knowledge Bases'
+    elif 'aws.amazon.com' in url_lower:
+        return 'Technology'
+    # Add more specific cases as needed
+    else:
+        return bookmark['category']
 
 def fetch_url_info(bookmark):
     """
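Review note: `llm_worker()` exits only when it dequeues a `None` sentinel, and the `task_done()` calls in its `finally` block are what let `llm_queue.join()` return. A sketch of a graceful shutdown under those assumptions (hypothetical call site; names as defined in this file):

    llm_queue.join()      # wait until every enqueued bookmark is acknowledged
    llm_queue.put(None)   # sentinel: llm_worker() returns when it sees None
    llm_thread.join(timeout=5)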
@@ -382,6 +453,28 @@ def fetch_url_info(bookmark):
         'slow_link': bookmark.get('slow_link', False),
     }
 
+def parse_bookmarks(file_content):
+    """
+    Parse bookmarks from HTML file.
+    """
+    logger.info("Parsing bookmarks")
+    try:
+        soup = BeautifulSoup(file_content, 'html.parser')
+        extracted_bookmarks = []
+        for link in soup.find_all('a'):
+            url = link.get('href')
+            title = link.text.strip()
+            if url and title:
+                if url.startswith('http://') or url.startswith('https://'):
+                    extracted_bookmarks.append({'url': url, 'title': title})
+                else:
+                    logger.info(f"Skipping non-http/https URL: {url}")
+        logger.info(f"Extracted {len(extracted_bookmarks)} bookmarks")
+        return extracted_bookmarks
+    except Exception as e:
+        logger.error("Error parsing bookmarks: %s", e, exc_info=True)
+        raise
+
 def vectorize_and_index(bookmarks_list):
     """
     Create vector embeddings for bookmarks and build FAISS index with ID mapping.
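Review note: `parse_bookmarks` relies only on `<a href>` anchors, so any HTML with links parses, not just the Netscape bookmarks export format. A quick sanity check (hypothetical snippet):

    sample = '<DL><DT><A HREF="https://example.com">Example</A></DL>'
    print(parse_bookmarks(sample))
    # -> [{'url': 'https://example.com', 'title': 'Example'}]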
@@ -453,6 +546,14 @@ def display_bookmarks():
     logger.info("HTML display generated")
     return cards
 
+def generate_summary_and_assign_category(bookmark):
+    """
+    Generate a concise summary and assign a category using a single LLM call.
+    This function is now handled by the LLM worker thread.
+    """
+    # This function is now deprecated and handled by the worker thread.
+    pass
+
 def process_uploaded_file(file, state_bookmarks):
     """
     Process the uploaded bookmarks file.
@@ -489,10 +590,14 @@ def process_uploaded_file(file, state_bookmarks):
     with ThreadPoolExecutor(max_workers=10) as executor:
         executor.map(fetch_url_info, bookmarks)
 
-    #
-    logger.info("
-
-
+    # Enqueue bookmarks for LLM processing
+    logger.info("Enqueuing bookmarks for LLM processing")
+    for bookmark in bookmarks:
+        llm_queue.put(bookmark)
+
+    # Wait until all LLM tasks are completed
+    llm_queue.join()
+    logger.info("All LLM tasks have been processed")
 
     try:
         faiss_index = vectorize_and_index(bookmarks)
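Review note: `llm_queue.join()` blocks until `task_done()` has been called once per `put()`, which is why the worker acknowledges even skipped or failed bookmarks. The handshake in isolation (toy sketch, hypothetical names):

    from queue import Queue
    import threading

    q = Queue()

    def worker():
        while True:
            item = q.get()
            try:
                print("processed", item)
            finally:
                q.task_done()  # always acknowledge, or join() hangs

    threading.Thread(target=worker, daemon=True).start()
    for i in range(3):
        q.put(i)
    q.join()  # returns only after three task_done() calls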
@@ -619,15 +724,12 @@ def chatbot_response(user_query, chat_history):
     try:
         chat_history.append({"role": "user", "content": user_query})
 
-
-
-
-
-
-
-        logger.info(f"Sleeping for {sleep_duration:.2f} seconds to respect rate limits.")
-        time.sleep(sleep_duration)
-        last_api_call_time = time.time()
+        # Rate Limiting
+        rpm_bucket.wait_for_token()
+        # Estimate tokens: prompt + max_tokens
+        # Here, we assume max_tokens=300 per chatbot response
+        total_tokens = 300  # Adjust based on actual usage
+        tpm_bucket.wait_for_token(tokens=total_tokens)
 
         query_vector = embedding_model.encode([user_query]).astype('float32')
         k = 5
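Review note: the 300-token figure budgets only the completion; the prompt itself also consumes TPM. A closer estimate could reuse the ~4 characters-per-token heuristic the removed code used (hypothetical sketch):

    # After the prompt is assembled later in this function, a closer
    # estimate would also charge the prompt against the TPM bucket:
    total_tokens = 300 + len(prompt) // 4
    tpm_bucket.wait_for_token(tokens=total_tokens)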
@@ -635,7 +737,7 @@ def chatbot_response(user_query, chat_history):
     ids = ids.flatten()
 
     id_to_bookmark = {bookmark['id']: bookmark for bookmark in bookmarks}
-    matching_bookmarks = [id_to_bookmark.get(id) for id in ids if id in id_to_bookmark]
+    matching_bookmarks = [id_to_bookmark.get(id) for id in ids if id in id_to_bookmark and id_to_bookmark.get(id).get('summary')]
 
     if not matching_bookmarks:
         answer = "No relevant bookmarks found for your query."
@@ -655,30 +757,17 @@ Bookmarks:
 Provide a concise and helpful response.
 """
 
-        def estimate_tokens(text):
-            return len(text) / 4
-
-        prompt_tokens = estimate_tokens(prompt)
-        max_tokens = 300
-        total_tokens = prompt_tokens + max_tokens
-
-        tokens_per_minute = 40000
-        tokens_per_second = tokens_per_minute / 60
-        required_delay = total_tokens / tokens_per_second
-        sleep_time = max(required_delay, 2)
-
         response = openai.ChatCompletion.create(
-            model='llama-3.1-70b-versatile',
+            model='llama-3.1-70b-versatile',  # Ensure this model is correct and available
             messages=[
                 {"role": "user", "content": prompt}
             ],
-            max_tokens=
+            max_tokens=300,
             temperature=0.7,
         )
 
         answer = response['choices'][0]['message']['content'].strip()
         logger.info("Chatbot response generated")
-        time.sleep(sleep_time)
 
         chat_history.append({"role": "assistant", "content": answer})
         return chat_history
@@ -809,7 +898,7 @@ Navigate through the tabs to explore each feature in detail.
 """)
 
         manage_output = gr.Textbox(label="🔄 Status", interactive=False)
-
+
         # CheckboxGroup for selecting bookmarks
         bookmark_selector = gr.CheckboxGroup(
             label="✅ Select Bookmarks",
@@ -870,8 +959,12 @@ Navigate through the tabs to explore each feature in detail.
         logger.info("Launching Gradio app")
         demo.launch(debug=True)
     except Exception as e:
-        logger.error(f"Error building
-        print(f"Error building
+        logger.error(f"Error building Gradio app: {e}", exc_info=True)
+        print(f"Error building Gradio app: {e}")
 
 if __name__ == "__main__":
+    # Start the LLM worker thread before launching the app
+    llm_thread = threading.Thread(target=llm_worker, daemon=True)
+    llm_thread.start()
+
     build_app()
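Review note: the worker is started with `daemon=True`, so it is killed abruptly when the main thread exits and any batch in flight at shutdown is lost. A non-daemon variant would need the `None` sentinel (hypothetical sketch using names from this file):

    llm_thread = threading.Thread(target=llm_worker)  # non-daemon
    llm_thread.start()
    try:
        build_app()
    finally:
        llm_queue.put(None)  # ask llm_worker() to return
        llm_thread.join()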