Al1Abdullah committed on
Commit
bb30230
·
verified ·
1 Parent(s): edb4ebc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +115 -22
app.py CHANGED
@@ -6,6 +6,10 @@ from langchain.embeddings import HuggingFaceEmbeddings
6
  from langchain.text_splitter import RecursiveCharacterTextSplitter
7
  from langchain_core.documents import Document
8
  import requests
 
 
 
 
9
 
10
  load_dotenv()
11
 
@@ -22,14 +26,80 @@ def initialize_groq():
22
  groq_api_key = os.getenv("GROQ_API_KEY")
23
  return "Groq API key found" if groq_api_key else "Groq API key not found"
24
 
25
- def initialize_vectorstore():
26
- global vectorstore, retriever
 
 
 
 
 
 
 
 
 
 
 
27
 
28
- try:
29
- vectorstore = FAISS.load_local("atomcamp_vector_db", embeddings, allow_dangerous_deserialization=True)
30
- retriever = vectorstore.as_retriever()
31
- return "Vectorstore loaded successfully"
32
- except:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  sample_data = [
34
  {
35
  "text": "Atomcamp is a leading data science education platform offering comprehensive courses in machine learning, Python programming, data analysis, and AI. We provide hands-on projects, expert mentorship, and career guidance to help students become successful data scientists.",
@@ -44,16 +114,33 @@ def initialize_vectorstore():
44
  "url": "https://www.atomcamp.com/learning-paths"
45
  }
46
  ]
47
-
48
  docs = [Document(page_content=item["text"], metadata={"url": item["url"]}) for item in sample_data]
49
- splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
50
- chunks = splitter.split_documents(docs)
51
-
52
- vectorstore = FAISS.from_documents(chunks, embeddings)
53
- vectorstore.save_local("atomcamp_vector_db")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  retriever = vectorstore.as_retriever()
55
-
56
- return "Sample vectorstore created successfully"
 
 
 
 
57
 
58
  def call_groq_api(message, context):
59
  global groq_api_key
@@ -102,9 +189,11 @@ def call_groq_api(message, context):
102
  result = response.json()
103
  return result["choices"][0]["message"]["content"]
104
  else:
 
105
  return None
106
 
107
  except Exception as e:
 
108
  return None
109
 
110
  def generate_response(message, context):
@@ -113,7 +202,7 @@ def generate_response(message, context):
113
  if groq_response:
114
  return groq_response
115
 
116
- # Fallback responses
117
  message_lower = message.lower()
118
 
119
  if any(word in message_lower for word in ['course', 'courses', 'learn', 'study']):
@@ -795,13 +884,13 @@ def index():
795
  if (line.trim().startsWith('•') || line.trim().startsWith('-')) {
796
  return `<div style="display: flex; align-items: flex-start; margin-bottom: 0.375rem;">
797
  <span style="color: #16a34a; margin-right: 0.5rem; margin-top: 0.125rem; font-size: 0.875rem; font-weight: 500;">•</span>
798
- <span style="font-size: 0.875rem; line-height: 1.5;">${line.replace(/^[•-]\\s*/, '')}</span>
799
  </div>`;
800
- } else if (/^\\d+\\./.test(line.trim())) {
801
- const match = line.match(/^\\d+\\./);
802
  return `<div style="display: flex; align-items: flex-start; margin-bottom: 0.375rem;">
803
  <span style="color: #16a34a; margin-right: 0.5rem; font-weight: 600; font-size: 0.875rem;">${match ? match[0] : ''}</span>
804
- <span style="font-size: 0.875rem; line-height: 1.5;">${line.replace(/^\\d+\\.\\s*/, '')}</span>
805
  </div>`;
806
  } else if (line.trim() === '') {
807
  return '<br>';
@@ -852,16 +941,20 @@ def chat():
852
  if not message:
853
  return jsonify({'error': 'No message provided'}), 400
854
 
 
855
  if retriever:
856
  docs = retriever.get_relevant_documents(message)
857
- context = "\n\n".join([doc.page_content for doc in docs[:3]])
 
858
  else:
859
  context = "I'm an AI assistant for Atomcamp, a data science education platform."
860
-
 
861
  response = generate_response(message, context)
862
  return jsonify({'response': response})
863
 
864
  except Exception as e:
 
865
  return jsonify({'error': f'Error: {str(e)}'}), 500
866
 
867
  @app.route('/static/<path:filename>')
 
6
  from langchain.text_splitter import RecursiveCharacterTextSplitter
7
  from langchain_core.documents import Document
8
  import requests
9
+ from bs4 import BeautifulSoup # New
10
+ from collections import deque # New
11
+ from urllib.parse import urljoin, urlparse # New
12
+ import time # New for crawl delay
13
 
14
  load_dotenv()
15
 
 
26
  groq_api_key = os.getenv("GROQ_API_KEY")
27
  return "Groq API key found" if groq_api_key else "Groq API key not found"
28
 
29
+ def crawl_website(url_to_crawl, max_pages=10):
30
+ """
31
+ Crawls a website to extract text content and links.
32
+ Args:
33
+ url_to_crawl (str): The starting URL for the crawl.
34
+ max_pages (int): Maximum number of pages to crawl.
35
+ Returns:
36
+ list: A list of dictionaries, each containing 'text' and 'url' of crawled pages.
37
+ """
38
+ base_domain = urlparse(url_to_crawl).netloc
39
+ queue = deque([url_to_crawl])
40
+ visited_urls = set()
41
+ scraped_data = []
42
 
43
+ print(f"Starting crawl from: {url_to_crawl}")
44
+
45
+ while queue and len(scraped_data) < max_pages:
46
+ current_url = queue.popleft()
47
+
48
+ if current_url in visited_urls:
49
+ continue
50
+
51
+ print(f"Crawling: {current_url}")
52
+ visited_urls.add(current_url)
53
+
54
+ try:
55
+ response = requests.get(current_url, timeout=10)
56
+ response.raise_for_status() # Raise an HTTPError for bad responses (4xx or 5xx)
57
+ soup = BeautifulSoup(response.text, 'lxml')
58
+
59
+ # Extract text content
60
+ page_text = ' '.join(p.get_text() for p in soup.find_all('p'))
61
+ page_text += ' '.join(li.get_text() for li in soup.find_all('li'))
62
+ page_text += ' '.join(h.get_text() for h in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']))
63
+
64
+ # Clean up extra whitespace and newlines
65
+ page_text = ' '.join(page_text.split()).strip()
66
+
67
+ if page_text:
68
+ scraped_data.append({"text": page_text, "url": current_url})
69
+
70
+ # Extract links
71
+ for link in soup.find_all('a', href=True):
72
+ href = link['href']
73
+ absolute_url = urljoin(current_url, href)
74
+ parsed_absolute_url = urlparse(absolute_url)
75
+
76
+ # Only follow links within the same domain and not already visited
77
+ if parsed_absolute_url.netloc == base_domain and absolute_url not in visited_urls:
78
+ if '#' not in absolute_url and 'mailto:' not in absolute_url and 'tel:' not in absolute_url: # Avoid anchor links and mail/tel links
79
+ queue.append(absolute_url)
80
+
81
+ except requests.exceptions.RequestException as e:
82
+ print(f"Error crawling {current_url}: {e}")
83
+ except Exception as e:
84
+ print(f"An unexpected error occurred with {current_url}: {e}")
85
+
86
+ time.sleep(1) # Be polite and avoid overwhelming the server
87
+
88
+ print(f"Finished crawling. Scraped {len(scraped_data)} pages.")
89
+ return scraped_data
90
+
91
+ def update_vectorstore_from_crawl(url_to_crawl="https://www.atomcamp.com/", max_pages=20):
92
+ """
93
+ Performs a web crawl and updates the FAISS vector store with the scraped data.
94
+ """
95
+ global vectorstore, retriever
96
+
97
+ print("Initiating website crawl for vector store update...")
98
+ scraped_data = crawl_website(url_to_crawl, max_pages)
99
+
100
+ if not scraped_data:
101
+ print("No data scraped from the website. Using sample data as fallback.")
102
+ # Fallback to sample data if crawl fails or yields no content
103
  sample_data = [
104
  {
105
  "text": "Atomcamp is a leading data science education platform offering comprehensive courses in machine learning, Python programming, data analysis, and AI. We provide hands-on projects, expert mentorship, and career guidance to help students become successful data scientists.",
 
114
  "url": "https://www.atomcamp.com/learning-paths"
115
  }
116
  ]
 
117
  docs = [Document(page_content=item["text"], metadata={"url": item["url"]}) for item in sample_data]
118
+ else:
119
+ docs = [Document(page_content=item["text"], metadata={"url": item["url"]}) for item in scraped_data]
120
+
121
+ splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
122
+ chunks = splitter.split_documents(docs)
123
+
124
+ vectorstore = FAISS.from_documents(chunks, embeddings)
125
+ vectorstore.save_local("atomcamp_vector_db")
126
+ retriever = vectorstore.as_retriever()
127
+ print("Vectorstore updated successfully from crawled data (or sample fallback).")
128
+ return "Vectorstore updated successfully"
129
+
130
+
131
def initialize_vectorstore():
    """Load the persisted FAISS index; on any failure, crawl the site and rebuild it."""
    global vectorstore, retriever

    try:
        # Fast path: reuse the index saved on disk by a previous run.
        # allow_dangerous_deserialization is required by FAISS.load_local
        # for pickle-backed stores; the file is produced locally by this app.
        vectorstore = FAISS.load_local(
            "atomcamp_vector_db",
            embeddings,
            allow_dangerous_deserialization=True,
        )
        retriever = vectorstore.as_retriever()
        print("Vectorstore loaded successfully from local storage.")
        return "Vectorstore loaded successfully"
    except Exception as e:
        # Missing/corrupt index (or retriever setup failure): fall back to a
        # fresh crawl-and-build of the vector store.
        print(f"Failed to load vectorstore: {e}. Attempting to crawl and build.")
        return update_vectorstore_from_crawl()
144
 
145
  def call_groq_api(message, context):
146
  global groq_api_key
 
189
  result = response.json()
190
  return result["choices"][0]["message"]["content"]
191
  else:
192
+ print(f"Groq API Error: {response.status_code} - {response.text}")
193
  return None
194
 
195
  except Exception as e:
196
+ print(f"Error calling Groq API: {e}")
197
  return None
198
 
199
  def generate_response(message, context):
 
202
  if groq_response:
203
  return groq_response
204
 
205
+ # Fallback responses (less likely to be hit with web scraping)
206
  message_lower = message.lower()
207
 
208
  if any(word in message_lower for word in ['course', 'courses', 'learn', 'study']):
 
884
  if (line.trim().startsWith('•') || line.trim().startsWith('-')) {
885
  return `<div style="display: flex; align-items: flex-start; margin-bottom: 0.375rem;">
886
  <span style="color: #16a34a; margin-right: 0.5rem; margin-top: 0.125rem; font-size: 0.875rem; font-weight: 500;">•</span>
887
+ <span style="font-size: 0.875rem; line-height: 1.5;">${line.replace(/^[•-]\s*/, '')}</span>
888
  </div>`;
889
+ } else if (/^\d+\./.test(line.trim())) {
890
+ const match = line.match(/^\d+\./);
891
  return `<div style="display: flex; align-items: flex-start; margin-bottom: 0.375rem;">
892
  <span style="color: #16a34a; margin-right: 0.5rem; font-weight: 600; font-size: 0.875rem;">${match ? match[0] : ''}</span>
893
+ <span style="font-size: 0.875rem; line-height: 1.5;">${line.replace(/^\d+\.\s*/, '')}</span>
894
  </div>`;
895
  } else if (line.trim() === '') {
896
  return '<br>';
 
941
  if not message:
942
  return jsonify({'error': 'No message provided'}), 400
943
 
944
+ # Retrieve relevant documents based on the user's query
945
  if retriever:
946
  docs = retriever.get_relevant_documents(message)
947
+ context = "\n\n".join([doc.page_content for doc in docs[:5]]) # Get top 5 relevant documents
948
+ print(f"Context from retriever: {context[:200]}...") # Print first 200 chars for debug
949
  else:
950
  context = "I'm an AI assistant for Atomcamp, a data science education platform."
951
+ print("Retriever not initialized, using default context.")
952
+
953
  response = generate_response(message, context)
954
  return jsonify({'response': response})
955
 
956
  except Exception as e:
957
+ print(f"Error in chat endpoint: {e}")
958
  return jsonify({'error': f'Error: {str(e)}'}), 500
959
 
960
  @app.route('/static/<path:filename>')