Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -6,6 +6,10 @@ from langchain.embeddings import HuggingFaceEmbeddings
|
|
| 6 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 7 |
from langchain_core.documents import Document
|
| 8 |
import requests
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
load_dotenv()
|
| 11 |
|
|
@@ -22,14 +26,80 @@ def initialize_groq():
|
|
| 22 |
groq_api_key = os.getenv("GROQ_API_KEY")
|
| 23 |
return "Groq API key found" if groq_api_key else "Groq API key not found"
|
| 24 |
|
| 25 |
-
def
|
| 26 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
sample_data = [
|
| 34 |
{
|
| 35 |
"text": "Atomcamp is a leading data science education platform offering comprehensive courses in machine learning, Python programming, data analysis, and AI. We provide hands-on projects, expert mentorship, and career guidance to help students become successful data scientists.",
|
|
@@ -44,16 +114,33 @@ def initialize_vectorstore():
|
|
| 44 |
"url": "https://www.atomcamp.com/learning-paths"
|
| 45 |
}
|
| 46 |
]
|
| 47 |
-
|
| 48 |
docs = [Document(page_content=item["text"], metadata={"url": item["url"]}) for item in sample_data]
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
retriever = vectorstore.as_retriever()
|
| 55 |
-
|
| 56 |
-
return "
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
|
| 58 |
def call_groq_api(message, context):
|
| 59 |
global groq_api_key
|
|
@@ -102,9 +189,11 @@ def call_groq_api(message, context):
|
|
| 102 |
result = response.json()
|
| 103 |
return result["choices"][0]["message"]["content"]
|
| 104 |
else:
|
|
|
|
| 105 |
return None
|
| 106 |
|
| 107 |
except Exception as e:
|
|
|
|
| 108 |
return None
|
| 109 |
|
| 110 |
def generate_response(message, context):
|
|
@@ -113,7 +202,7 @@ def generate_response(message, context):
|
|
| 113 |
if groq_response:
|
| 114 |
return groq_response
|
| 115 |
|
| 116 |
-
# Fallback responses
|
| 117 |
message_lower = message.lower()
|
| 118 |
|
| 119 |
if any(word in message_lower for word in ['course', 'courses', 'learn', 'study']):
|
|
@@ -795,13 +884,13 @@ def index():
|
|
| 795 |
if (line.trim().startsWith('•') || line.trim().startsWith('-')) {
|
| 796 |
return `<div style="display: flex; align-items: flex-start; margin-bottom: 0.375rem;">
|
| 797 |
<span style="color: #16a34a; margin-right: 0.5rem; margin-top: 0.125rem; font-size: 0.875rem; font-weight: 500;">•</span>
|
| 798 |
-
<span style="font-size: 0.875rem; line-height: 1.5;">${line.replace(/^[•-]
|
| 799 |
</div>`;
|
| 800 |
-
} else if (
|
| 801 |
-
const match = line.match(
|
| 802 |
return `<div style="display: flex; align-items: flex-start; margin-bottom: 0.375rem;">
|
| 803 |
<span style="color: #16a34a; margin-right: 0.5rem; font-weight: 600; font-size: 0.875rem;">${match ? match[0] : ''}</span>
|
| 804 |
-
<span style="font-size: 0.875rem; line-height: 1.5;">${line.replace(
|
| 805 |
</div>`;
|
| 806 |
} else if (line.trim() === '') {
|
| 807 |
return '<br>';
|
|
@@ -852,16 +941,20 @@ def chat():
|
|
| 852 |
if not message:
|
| 853 |
return jsonify({'error': 'No message provided'}), 400
|
| 854 |
|
|
|
|
| 855 |
if retriever:
|
| 856 |
docs = retriever.get_relevant_documents(message)
|
| 857 |
-
context = "\n\n".join([doc.page_content for doc in docs[:
|
|
|
|
| 858 |
else:
|
| 859 |
context = "I'm an AI assistant for Atomcamp, a data science education platform."
|
| 860 |
-
|
|
|
|
| 861 |
response = generate_response(message, context)
|
| 862 |
return jsonify({'response': response})
|
| 863 |
|
| 864 |
except Exception as e:
|
|
|
|
| 865 |
return jsonify({'error': f'Error: {str(e)}'}), 500
|
| 866 |
|
| 867 |
@app.route('/static/<path:filename>')
|
|
|
|
| 6 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 7 |
from langchain_core.documents import Document
|
| 8 |
import requests
|
| 9 |
+
from bs4 import BeautifulSoup # New
|
| 10 |
+
from collections import deque # New
|
| 11 |
+
from urllib.parse import urljoin, urlparse # New
|
| 12 |
+
import time # New for crawl delay
|
| 13 |
|
| 14 |
load_dotenv()
|
| 15 |
|
|
|
|
| 26 |
groq_api_key = os.getenv("GROQ_API_KEY")
|
| 27 |
return "Groq API key found" if groq_api_key else "Groq API key not found"
|
| 28 |
|
| 29 |
+
def crawl_website(url_to_crawl, max_pages=10):
    """
    Crawl a website breadth-first and collect the text of its pages.

    Starting from ``url_to_crawl``, pages are fetched one at a time and only
    http(s) links whose host (``netloc``) equals that of the start URL are
    followed. URL fragments (``#section``) are stripped before a link is
    queued, so anchored links resolve to the page they point into.

    Args:
        url_to_crawl (str): The starting URL for the crawl.
        max_pages (int): Maximum number of pages to collect text from. Pages
            that fail to download, are not HTML, or yield no text do not
            count towards this limit.

    Returns:
        list: A list of dictionaries, each containing the 'text' and 'url'
        of one crawled page, in crawl order.
    """
    base_domain = urlparse(url_to_crawl).netloc
    queue = deque([url_to_crawl])
    # Every URL ever placed on the queue, so a link repeated on many pages
    # (navigation bars, footers) is enqueued only once.
    queued_urls = {url_to_crawl}
    visited_urls = set()
    scraped_data = []

    print(f"Starting crawl from: {url_to_crawl}")

    while queue and len(scraped_data) < max_pages:
        current_url = queue.popleft()

        if current_url in visited_urls:
            continue

        # Be polite: pause between consecutive requests, but not before the
        # first one and not after the last one.
        if visited_urls:
            time.sleep(1)

        print(f"Crawling: {current_url}")
        visited_urls.add(current_url)

        try:
            response = requests.get(current_url, timeout=10)
            response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)

            # Only parse HTML; a missing Content-Type header is given the
            # benefit of the doubt.
            content_type = response.headers.get("Content-Type", "")
            if content_type and "html" not in content_type.lower():
                print(f"Skipping non-HTML content at {current_url}")
            else:
                soup = BeautifulSoup(response.text, 'lxml')

                page_text = _extract_page_text(soup)
                if page_text:
                    scraped_data.append({"text": page_text, "url": current_url})

                for link in soup.find_all('a', href=True):
                    next_url = _same_site_url(current_url, link['href'], base_domain)
                    if next_url and next_url not in queued_urls:
                        queued_urls.add(next_url)
                        queue.append(next_url)

        except requests.exceptions.RequestException as e:
            print(f"Error crawling {current_url}: {e}")
        except Exception as e:
            # Best-effort crawl: one malformed page must not abort the run.
            print(f"An unexpected error occurred with {current_url}: {e}")

    print(f"Finished crawling. Scraped {len(scraped_data)} pages.")
    return scraped_data


def _extract_page_text(soup):
    """Return the whitespace-normalised text of all <p>, <li> and heading tags."""
    tag_groups = (
        soup.find_all('p'),
        soup.find_all('li'),
        soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']),
    )
    pieces = [tag.get_text() for tags in tag_groups for tag in tags]
    # Join every piece with a space so the last word of one group is never
    # fused with the first word of the next, then collapse runs of whitespace.
    return ' '.join(' '.join(pieces).split())


def _same_site_url(page_url, href, base_domain):
    """Resolve href against page_url; return it fragment-free if it is an http(s) URL on base_domain, else None."""
    try:
        parsed = urlparse(urljoin(page_url, href))
    except ValueError:
        # Malformed href (e.g. a broken IPv6 literal); ignore this link only.
        return None
    if parsed.scheme not in ("http", "https") or parsed.netloc != base_domain:
        return None
    return parsed._replace(fragment="").geturl()
|
| 90 |
+
|
| 91 |
+
def update_vectorstore_from_crawl(url_to_crawl="https://www.atomcamp.com/", max_pages=20):
|
| 92 |
+
"""
|
| 93 |
+
Performs a web crawl and updates the FAISS vector store with the scraped data.
|
| 94 |
+
"""
|
| 95 |
+
global vectorstore, retriever
|
| 96 |
+
|
| 97 |
+
print("Initiating website crawl for vector store update...")
|
| 98 |
+
scraped_data = crawl_website(url_to_crawl, max_pages)
|
| 99 |
+
|
| 100 |
+
if not scraped_data:
|
| 101 |
+
print("No data scraped from the website. Using sample data as fallback.")
|
| 102 |
+
# Fallback to sample data if crawl fails or yields no content
|
| 103 |
sample_data = [
|
| 104 |
{
|
| 105 |
"text": "Atomcamp is a leading data science education platform offering comprehensive courses in machine learning, Python programming, data analysis, and AI. We provide hands-on projects, expert mentorship, and career guidance to help students become successful data scientists.",
|
|
|
|
| 114 |
"url": "https://www.atomcamp.com/learning-paths"
|
| 115 |
}
|
| 116 |
]
|
|
|
|
| 117 |
docs = [Document(page_content=item["text"], metadata={"url": item["url"]}) for item in sample_data]
|
| 118 |
+
else:
|
| 119 |
+
docs = [Document(page_content=item["text"], metadata={"url": item["url"]}) for item in scraped_data]
|
| 120 |
+
|
| 121 |
+
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
|
| 122 |
+
chunks = splitter.split_documents(docs)
|
| 123 |
+
|
| 124 |
+
vectorstore = FAISS.from_documents(chunks, embeddings)
|
| 125 |
+
vectorstore.save_local("atomcamp_vector_db")
|
| 126 |
+
retriever = vectorstore.as_retriever()
|
| 127 |
+
print("Vectorstore updated successfully from crawled data (or sample fallback).")
|
| 128 |
+
return "Vectorstore updated successfully"
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
def initialize_vectorstore():
    """
    Load the persisted FAISS index, or build a new one by crawling.

    Sets the module-level ``vectorstore`` and ``retriever``. If loading the
    "atomcamp_vector_db" directory fails for any reason, the work is handed
    to ``update_vectorstore_from_crawl()``.

    Returns:
        str: "Vectorstore loaded successfully" when the local index was
        loaded; otherwise whatever ``update_vectorstore_from_crawl()`` returns.
    """
    global vectorstore, retriever

    index_dir = "atomcamp_vector_db"
    try:
        # NOTE(review): allow_dangerous_deserialization=True presumably permits
        # unpickling the stored index; confirm this directory is only ever
        # written by this application.
        loaded_store = FAISS.load_local(index_dir, embeddings, allow_dangerous_deserialization=True)
        vectorstore = loaded_store
        retriever = loaded_store.as_retriever()
        print("Vectorstore loaded successfully from local storage.")
        status = "Vectorstore loaded successfully"
    except Exception as load_error:
        # Any failure (missing directory, incompatible index, ...) triggers a rebuild.
        print(f"Failed to load vectorstore: {load_error}. Attempting to crawl and build.")
        status = update_vectorstore_from_crawl()
    return status
|
| 144 |
|
| 145 |
def call_groq_api(message, context):
|
| 146 |
global groq_api_key
|
|
|
|
| 189 |
result = response.json()
|
| 190 |
return result["choices"][0]["message"]["content"]
|
| 191 |
else:
|
| 192 |
+
print(f"Groq API Error: {response.status_code} - {response.text}")
|
| 193 |
return None
|
| 194 |
|
| 195 |
except Exception as e:
|
| 196 |
+
print(f"Error calling Groq API: {e}")
|
| 197 |
return None
|
| 198 |
|
| 199 |
def generate_response(message, context):
|
|
|
|
| 202 |
if groq_response:
|
| 203 |
return groq_response
|
| 204 |
|
| 205 |
+
# Fallback responses (less likely to be hit with web scraping)
|
| 206 |
message_lower = message.lower()
|
| 207 |
|
| 208 |
if any(word in message_lower for word in ['course', 'courses', 'learn', 'study']):
|
|
|
|
| 884 |
if (line.trim().startsWith('•') || line.trim().startsWith('-')) {
|
| 885 |
return `<div style="display: flex; align-items: flex-start; margin-bottom: 0.375rem;">
|
| 886 |
<span style="color: #16a34a; margin-right: 0.5rem; margin-top: 0.125rem; font-size: 0.875rem; font-weight: 500;">•</span>
|
| 887 |
+
<span style="font-size: 0.875rem; line-height: 1.5;">${line.replace(/^[•-]\s*/, '')}</span>
|
| 888 |
</div>`;
|
| 889 |
+
} else if (/^\d+\./.test(line.trim())) {
|
| 890 |
+
const match = line.match(/^\d+\./);
|
| 891 |
return `<div style="display: flex; align-items: flex-start; margin-bottom: 0.375rem;">
|
| 892 |
<span style="color: #16a34a; margin-right: 0.5rem; font-weight: 600; font-size: 0.875rem;">${match ? match[0] : ''}</span>
|
| 893 |
+
<span style="font-size: 0.875rem; line-height: 1.5;">${line.replace(/^\d+\.\s*/, '')}</span>
|
| 894 |
</div>`;
|
| 895 |
} else if (line.trim() === '') {
|
| 896 |
return '<br>';
|
|
|
|
| 941 |
if not message:
|
| 942 |
return jsonify({'error': 'No message provided'}), 400
|
| 943 |
|
| 944 |
+
# Retrieve relevant documents based on the user's query
|
| 945 |
if retriever:
|
| 946 |
docs = retriever.get_relevant_documents(message)
|
| 947 |
+
context = "\n\n".join([doc.page_content for doc in docs[:5]]) # Get top 5 relevant documents
|
| 948 |
+
print(f"Context from retriever: {context[:200]}...") # Print first 200 chars for debug
|
| 949 |
else:
|
| 950 |
context = "I'm an AI assistant for Atomcamp, a data science education platform."
|
| 951 |
+
print("Retriever not initialized, using default context.")
|
| 952 |
+
|
| 953 |
response = generate_response(message, context)
|
| 954 |
return jsonify({'response': response})
|
| 955 |
|
| 956 |
except Exception as e:
|
| 957 |
+
print(f"Error in chat endpoint: {e}")
|
| 958 |
return jsonify({'error': f'Error: {str(e)}'}), 500
|
| 959 |
|
| 960 |
@app.route('/static/<path:filename>')
|