"""Flask chatbot for Atomcamp.

Crawls the Atomcamp website, indexes the scraped text in a FAISS vector
store, and answers chat questions via the Groq API (with canned fallback
responses when the API is unavailable).
"""

from flask import Flask, render_template_string, request, jsonify, send_from_directory
import os
from dotenv import load_dotenv
from langchain_community.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
import requests
from bs4 import BeautifulSoup
from collections import deque
from urllib.parse import urljoin, urlparse
import time  # for crawl delay

load_dotenv()

app = Flask(__name__)

# Global state shared across requests.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectorstore = None
retriever = None
groq_api_key = None


def initialize_groq():
    """Load the Groq API key from the environment into the module global.

    Returns:
        str: A human-readable status message.
    """
    global groq_api_key
    groq_api_key = os.getenv("GROQ_API_KEY")
    return "Groq API key found" if groq_api_key else "Groq API key not found"


def crawl_website(url_to_crawl, max_pages=10):
    """
    Crawls a website to extract text content and links.

    Args:
        url_to_crawl (str): The starting URL for the crawl.
        max_pages (int): Maximum number of pages to crawl.

    Returns:
        list: A list of dictionaries, each containing 'text' and 'url' of crawled pages.
    """
    base_domain = urlparse(url_to_crawl).netloc
    queue = deque([url_to_crawl])
    visited_urls = set()
    scraped_data = []

    print(f"Starting crawl from: {url_to_crawl}")

    while queue and len(scraped_data) < max_pages:
        current_url = queue.popleft()
        if current_url in visited_urls:
            continue

        print(f"Crawling: {current_url}")
        visited_urls.add(current_url)

        try:
            response = requests.get(current_url, timeout=10)
            response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)
            soup = BeautifulSoup(response.text, 'lxml')

            # Extract text content from paragraphs, list items and headings.
            # The groups are joined with a space (the previous version
            # concatenated them with no separator, fusing the last word of
            # one group to the first word of the next).
            text_groups = [
                ' '.join(p.get_text() for p in soup.find_all('p')),
                ' '.join(li.get_text() for li in soup.find_all('li')),
                ' '.join(h.get_text() for h in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])),
            ]
            # Clean up extra whitespace and newlines.
            page_text = ' '.join(' '.join(text_groups).split()).strip()

            if page_text:
                scraped_data.append({"text": page_text, "url": current_url})

            # Extract links: only follow same-domain URLs not already visited.
            for link in soup.find_all('a', href=True):
                absolute_url = urljoin(current_url, link['href'])
                parsed_absolute_url = urlparse(absolute_url)
                if parsed_absolute_url.netloc == base_domain and absolute_url not in visited_urls:
                    # Avoid anchor links and mail/tel links.
                    if '#' not in absolute_url and 'mailto:' not in absolute_url and 'tel:' not in absolute_url:
                        queue.append(absolute_url)
        except requests.exceptions.RequestException as e:
            print(f"Error crawling {current_url}: {e}")
        except Exception as e:
            print(f"An unexpected error occurred with {current_url}: {e}")

        time.sleep(1)  # Be polite and avoid overwhelming the server

    print(f"Finished crawling. Scraped {len(scraped_data)} pages.")
    return scraped_data


def update_vectorstore_from_crawl(url_to_crawl="https://www.atomcamp.com/", max_pages=20):
    """
    Performs a web crawl and updates the FAISS vector store with the scraped data.

    Falls back to built-in sample documents when the crawl yields nothing,
    so the chatbot always has some retrievable context.

    Returns:
        str: A status message.
    """
    global vectorstore, retriever

    print("Initiating website crawl for vector store update...")
    scraped_data = crawl_website(url_to_crawl, max_pages)

    if not scraped_data:
        print("No data scraped from the website. Using sample data as fallback.")
        # Fallback to sample data if crawl fails or yields no content.
        sample_data = [
            {
                "text": "Atomcamp is a leading data science education platform offering comprehensive courses in machine learning, Python programming, data analysis, and AI. We provide hands-on projects, expert mentorship, and career guidance to help students become successful data scientists.",
                "url": "https://www.atomcamp.com/about"
            },
            {
                "text": "Our courses include: Python for Data Science, Machine Learning Fundamentals, Deep Learning with TensorFlow, Data Visualization with Matplotlib and Seaborn, SQL for Data Analysis, Statistics for Data Science, and Advanced AI Techniques.",
                "url": "https://www.atomcamp.com/courses"
            },
            {
                "text": "Atomcamp offers flexible learning paths: Beginner Track (3 months) - Python basics, data manipulation, basic statistics. Intermediate Track (6 months) - Machine learning, advanced Python, real projects. Advanced Track (9 months) - Deep learning, AI, industry projects, job placement assistance.",
                "url": "https://www.atomcamp.com/learning-paths"
            }
        ]
        docs = [Document(page_content=item["text"], metadata={"url": item["url"]}) for item in sample_data]
    else:
        docs = [Document(page_content=item["text"], metadata={"url": item["url"]}) for item in scraped_data]

    # Chunk the documents so each embedding covers a focused span of text.
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = splitter.split_documents(docs)

    vectorstore = FAISS.from_documents(chunks, embeddings)
    vectorstore.save_local("atomcamp_vector_db")
    retriever = vectorstore.as_retriever()

    print("Vectorstore updated successfully from crawled data (or sample fallback).")
    return "Vectorstore updated successfully"


def initialize_vectorstore():
    """Load the FAISS index from disk, or crawl and build one if loading fails.

    Returns:
        str: A status message.
    """
    global vectorstore, retriever
    try:
        # Attempt to load existing vectorstore.
        # NOTE(review): allow_dangerous_deserialization unpickles the local
        # index file — safe only because we wrote the file ourselves.
        vectorstore = FAISS.load_local("atomcamp_vector_db", embeddings, allow_dangerous_deserialization=True)
        retriever = vectorstore.as_retriever()
        print("Vectorstore loaded successfully from local storage.")
        return "Vectorstore loaded successfully"
    except Exception as e:
        print(f"Failed to load vectorstore: {e}. Attempting to crawl and build.")
        # If loading fails, crawl the website and build a new one.
        return update_vectorstore_from_crawl()


def call_groq_api(message, context):
    """Ask the Groq chat-completions API to answer `message` given `context`.

    Returns:
        str | None: The model's reply, or None when the key is missing or
        the request fails (callers fall back to canned responses).
    """
    global groq_api_key
    if not groq_api_key:
        return None
    try:
        system_prompt = f"""You are an AI assistant for Atomcamp, a data science education platform. Use the following context to answer questions about Atomcamp's courses, career services, and data science topics.

Context: {context}

Guidelines:
- Be helpful and informative
- Focus on Atomcamp's offerings
- Provide specific details when available
- Use bullet points for lists
- Keep responses concise but comprehensive
- Do not use emojis
"""
        headers = {
            "Authorization": f"Bearer {groq_api_key}",
            "Content-Type": "application/json"
        }
        data = {
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": message}
            ],
            "model": "llama3-8b-8192",
            "temperature": 0.7,
            "max_tokens": 1000
        }
        response = requests.post(
            "https://api.groq.com/openai/v1/chat/completions",
            headers=headers,
            json=data,
            timeout=30
        )
        if response.status_code == 200:
            result = response.json()
            return result["choices"][0]["message"]["content"]
        else:
            print(f"Groq API Error: {response.status_code} - {response.text}")
            return None
    except Exception as e:
        print(f"Error calling Groq API: {e}")
        return None


def generate_response(message, context):
    """Produce a chat reply: Groq if available, else keyword-matched fallbacks."""
    groq_response = call_groq_api(message, context)
    if groq_response:
        return groq_response

    # Fallback responses (less likely to be hit with web scraping).
    message_lower = message.lower()
    if any(word in message_lower for word in ['course', 'courses', 'learn', 'study']):
        return """Atomcamp Courses:

Core Programs:
• Python for Data Science - Master Python programming fundamentals
• Machine Learning Fundamentals - Learn ML algorithms and applications
• Deep Learning with TensorFlow - Build neural networks and AI models
• Data Visualization - Create stunning charts with Matplotlib & Seaborn
• SQL for Data Analysis - Database querying and data manipulation
• Statistics for Data Science - Statistical analysis and hypothesis testing

Learning Tracks:
• Beginner Track (3 months) - Perfect for newcomers
• Intermediate Track (6 months) - Build real-world projects
• Advanced Track (9 months) - Industry-ready with job placement

Would you like details about any specific course?"""
    elif any(word in message_lower for word in ['career', 'job', 'placement']):
        return """Career Services at Atomcamp:

Job Placement Support:
• Resume building and optimization
• Technical interview preparation
• Portfolio development guidance
• Direct connections with hiring partners
• Mock interviews with industry experts

Career Growth:
• Average salary increase: 150-300%
• 95% job placement rate within 6 months
• Access to exclusive job opportunities
• Ongoing career mentorship
• Industry networking events

Ready to transform your career in data science?"""
    else:
        return f"""Thank you for your question about "{message}"!

As an Atomcamp AI assistant, I'm here to help you with:

Course Information - Learn about our data science programs
Career Guidance - Job placement and career growth
Technical Topics - Python, ML, AI, and data analysis
Getting Started - How to begin your data science journey

Would you like me to elaborate on any specific aspect?"""


@app.route('/')
def index():
    """Serve the single-page chat UI."""
    # NOTE(review): the template below appears to have lost its HTML markup
    # during an extraction/paste step — only the visible text survived.
    # Restore the original markup from version control before deploying.
    html_template = """atomcamp AI Chatbot

Chatbot

Online

Welcome to atomcamp AI

Ask me about courses, data science concepts, and learning paths.

Ask me about:

  • Course information
  • Data science concepts
  • Learning paths

Try asking:

  • "What courses do you offer?"
  • "Explain machine learning"
  • "How do I get started?"

Connected to atomcamp AI 0/1000
"""
    return render_template_string(html_template)


@app.route('/chat', methods=['POST'])
def chat():
    """JSON chat endpoint: {"message": ...} -> {"response": ...}."""
    try:
        data = request.get_json()
        message = data.get('message', '')
        if not message:
            return jsonify({'error': 'No message provided'}), 400

        # Retrieve relevant documents based on the user's query.
        if retriever:
            docs = retriever.get_relevant_documents(message)
            context = "\n\n".join([doc.page_content for doc in docs[:5]])  # Top 5 relevant documents
            print(f"Context from retriever: {context[:200]}...")  # First 200 chars for debug
        else:
            context = "I'm an AI assistant for Atomcamp, a data science education platform."
            print("Retriever not initialized, using default context.")

        response = generate_response(message, context)
        return jsonify({'response': response})
    except Exception as e:
        print(f"Error in chat endpoint: {e}")
        return jsonify({'error': f'Error: {str(e)}'}), 500


# BUG FIX: the route previously read '/static/' with no URL variable, so
# Flask could never bind the `filename` parameter; a path converter is
# required for the view signature to match the rule.
@app.route('/static/<path:filename>')
def static_files(filename):
    """Serve files from the local static/ directory."""
    return send_from_directory('static', filename)


# Initialize systems at import time so the first request has context.
initialize_groq()
initialize_vectorstore()

if __name__ == "__main__":
    port = int(os.environ.get("PORT", 7860))
    app.run(host="0.0.0.0", port=port, debug=False)