Al1Abdullah committed on
Commit
bb30230
·
verified ·
1 Parent(s): edb4ebc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +115 -22
app.py CHANGED
@@ -6,6 +6,10 @@ from langchain.embeddings import HuggingFaceEmbeddings
6
  from langchain.text_splitter import RecursiveCharacterTextSplitter
7
  from langchain_core.documents import Document
8
  import requests
 
 
 
 
9
 
10
  load_dotenv()
11
 
@@ -22,14 +26,80 @@ def initialize_groq():
22
  groq_api_key = os.getenv("GROQ_API_KEY")
23
  return "Groq API key found" if groq_api_key else "Groq API key not found"
24
 
25
- def initialize_vectorstore():
26
- global vectorstore, retriever
 
 
 
 
 
 
 
 
 
 
 
27
 
28
- try:
29
- vectorstore = FAISS.load_local("atomcamp_vector_db", embeddings, allow_dangerous_deserialization=True)
30
- retriever = vectorstore.as_retriever()
31
- return "Vectorstore loaded successfully"
32
- except:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  sample_data = [
34
  {
35
  "text": "Atomcamp is a leading data science education platform offering comprehensive courses in machine learning, Python programming, data analysis, and AI. We provide hands-on projects, expert mentorship, and career guidance to help students become successful data scientists.",
@@ -44,16 +114,33 @@ def initialize_vectorstore():
44
  "url": "https://www.atomcamp.com/learning-paths"
45
  }
46
  ]
47
-
48
  docs = [Document(page_content=item["text"], metadata={"url": item["url"]}) for item in sample_data]
49
- splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
50
- chunks = splitter.split_documents(docs)
51
-
52
- vectorstore = FAISS.from_documents(chunks, embeddings)
53
- vectorstore.save_local("atomcamp_vector_db")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  retriever = vectorstore.as_retriever()
55
-
56
- return "Sample vectorstore created successfully"
 
 
 
 
57
 
58
  def call_groq_api(message, context):
59
  global groq_api_key
@@ -102,9 +189,11 @@ def call_groq_api(message, context):
102
  result = response.json()
103
  return result["choices"][0]["message"]["content"]
104
  else:
 
105
  return None
106
 
107
  except Exception as e:
 
108
  return None
109
 
110
  def generate_response(message, context):
@@ -113,7 +202,7 @@ def generate_response(message, context):
113
  if groq_response:
114
  return groq_response
115
 
116
- # Fallback responses
117
  message_lower = message.lower()
118
 
119
  if any(word in message_lower for word in ['course', 'courses', 'learn', 'study']):
@@ -795,13 +884,13 @@ def index():
795
  if (line.trim().startsWith('•') || line.trim().startsWith('-')) {
796
  return `<div style="display: flex; align-items: flex-start; margin-bottom: 0.375rem;">
797
  <span style="color: #16a34a; margin-right: 0.5rem; margin-top: 0.125rem; font-size: 0.875rem; font-weight: 500;">•</span>
798
- <span style="font-size: 0.875rem; line-height: 1.5;">${line.replace(/^[•-]\\s*/, '')}</span>
799
  </div>`;
800
- } else if (/^\\d+\\./.test(line.trim())) {
801
- const match = line.match(/^\\d+\\./);
802
  return `<div style="display: flex; align-items: flex-start; margin-bottom: 0.375rem;">
803
  <span style="color: #16a34a; margin-right: 0.5rem; font-weight: 600; font-size: 0.875rem;">${match ? match[0] : ''}</span>
804
- <span style="font-size: 0.875rem; line-height: 1.5;">${line.replace(/^\\d+\\.\\s*/, '')}</span>
805
  </div>`;
806
  } else if (line.trim() === '') {
807
  return '<br>';
@@ -852,16 +941,20 @@ def chat():
852
  if not message:
853
  return jsonify({'error': 'No message provided'}), 400
854
 
 
855
  if retriever:
856
  docs = retriever.get_relevant_documents(message)
857
- context = "\n\n".join([doc.page_content for doc in docs[:3]])
 
858
  else:
859
  context = "I'm an AI assistant for Atomcamp, a data science education platform."
860
-
 
861
  response = generate_response(message, context)
862
  return jsonify({'response': response})
863
 
864
  except Exception as e:
 
865
  return jsonify({'error': f'Error: {str(e)}'}), 500
866
 
867
  @app.route('/static/<path:filename>')
 
6
  from langchain.text_splitter import RecursiveCharacterTextSplitter
7
  from langchain_core.documents import Document
8
  import requests
9
+ from bs4 import BeautifulSoup # New
10
+ from collections import deque # New
11
+ from urllib.parse import urljoin, urlparse # New
12
+ import time # New for crawl delay
13
 
14
  load_dotenv()
15
 
 
26
  groq_api_key = os.getenv("GROQ_API_KEY")
27
  return "Groq API key found" if groq_api_key else "Groq API key not found"
28
 
29
+ def crawl_website(url_to_crawl, max_pages=10):
30
+ """
31
+ Crawls a website to extract text content and links.
32
+ Args:
33
+ url_to_crawl (str): The starting URL for the crawl.
34
+ max_pages (int): Maximum number of pages to crawl.
35
+ Returns:
36
+ list: A list of dictionaries, each containing 'text' and 'url' of crawled pages.
37
+ """
38
+ base_domain = urlparse(url_to_crawl).netloc
39
+ queue = deque([url_to_crawl])
40
+ visited_urls = set()
41
+ scraped_data = []
42
 
43
+ print(f"Starting crawl from: {url_to_crawl}")
44
+
45
+ while queue and len(scraped_data) < max_pages:
46
+ current_url = queue.popleft()
47
+
48
+ if current_url in visited_urls:
49
+ continue
50
+
51
+ print(f"Crawling: {current_url}")
52
+ visited_urls.add(current_url)
53
+
54
+ try:
55
+ response = requests.get(current_url, timeout=10)
56
+ response.raise_for_status() # Raise an HTTPError for bad responses (4xx or 5xx)
57
+ soup = BeautifulSoup(response.text, 'lxml')
58
+
59
+ # Extract text content
60
+ page_text = ' '.join(p.get_text() for p in soup.find_all('p'))
61
+ page_text += ' '.join(li.get_text() for li in soup.find_all('li'))
62
+ page_text += ' '.join(h.get_text() for h in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']))
63
+
64
+ # Clean up extra whitespace and newlines
65
+ page_text = ' '.join(page_text.split()).strip()
66
+
67
+ if page_text:
68
+ scraped_data.append({"text": page_text, "url": current_url})
69
+
70
+ # Extract links
71
+ for link in soup.find_all('a', href=True):
72
+ href = link['href']
73
+ absolute_url = urljoin(current_url, href)
74
+ parsed_absolute_url = urlparse(absolute_url)
75
+
76
+ # Only follow links within the same domain and not already visited
77
+ if parsed_absolute_url.netloc == base_domain and absolute_url not in visited_urls:
78
+ if '#' not in absolute_url and 'mailto:' not in absolute_url and 'tel:' not in absolute_url: # Avoid anchor links and mail/tel links
79
+ queue.append(absolute_url)
80
+
81
+ except requests.exceptions.RequestException as e:
82
+ print(f"Error crawling {current_url}: {e}")
83
+ except Exception as e:
84
+ print(f"An unexpected error occurred with {current_url}: {e}")
85
+
86
+ time.sleep(1) # Be polite and avoid overwhelming the server
87
+
88
+ print(f"Finished crawling. Scraped {len(scraped_data)} pages.")
89
+ return scraped_data
90
+
91
+ def update_vectorstore_from_crawl(url_to_crawl="https://www.atomcamp.com/", max_pages=20):
92
+ """
93
+ Performs a web crawl and updates the FAISS vector store with the scraped data.
94
+ """
95
+ global vectorstore, retriever
96
+
97
+ print("Initiating website crawl for vector store update...")
98
+ scraped_data = crawl_website(url_to_crawl, max_pages)
99
+
100
+ if not scraped_data:
101
+ print("No data scraped from the website. Using sample data as fallback.")
102
+ # Fallback to sample data if crawl fails or yields no content
103
  sample_data = [
104
  {
105
  "text": "Atomcamp is a leading data science education platform offering comprehensive courses in machine learning, Python programming, data analysis, and AI. We provide hands-on projects, expert mentorship, and career guidance to help students become successful data scientists.",
 
114
  "url": "https://www.atomcamp.com/learning-paths"
115
  }
116
  ]
 
117
  docs = [Document(page_content=item["text"], metadata={"url": item["url"]}) for item in sample_data]
118
+ else:
119
+ docs = [Document(page_content=item["text"], metadata={"url": item["url"]}) for item in scraped_data]
120
+
121
+ splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
122
+ chunks = splitter.split_documents(docs)
123
+
124
+ vectorstore = FAISS.from_documents(chunks, embeddings)
125
+ vectorstore.save_local("atomcamp_vector_db")
126
+ retriever = vectorstore.as_retriever()
127
+ print("Vectorstore updated successfully from crawled data (or sample fallback).")
128
+ return "Vectorstore updated successfully"
129
+
130
+
131
def initialize_vectorstore():
    """Load the persisted FAISS index; on any failure, crawl the site and rebuild it."""
    global vectorstore, retriever

    try:
        # Fast path: reuse the index saved on disk by a previous run.
        # allow_dangerous_deserialization is required by FAISS.load_local
        # for pickle-backed stores; the file is produced locally by this app.
        vectorstore = FAISS.load_local(
            "atomcamp_vector_db",
            embeddings,
            allow_dangerous_deserialization=True,
        )
        retriever = vectorstore.as_retriever()
        print("Vectorstore loaded successfully from local storage.")
        return "Vectorstore loaded successfully"
    except Exception as e:
        # Missing/corrupt index (or retriever setup failure): fall back to a
        # fresh crawl-and-build of the vector store.
        print(f"Failed to load vectorstore: {e}. Attempting to crawl and build.")
        return update_vectorstore_from_crawl()
144
 
145
  def call_groq_api(message, context):
146
  global groq_api_key
 
189
  result = response.json()
190
  return result["choices"][0]["message"]["content"]
191
  else:
192
+ print(f"Groq API Error: {response.status_code} - {response.text}")
193
  return None
194
 
195
  except Exception as e:
196
+ print(f"Error calling Groq API: {e}")
197
  return None
198
 
199
  def generate_response(message, context):
 
202
  if groq_response:
203
  return groq_response
204
 
205
+ # Fallback responses (less likely to be hit with web scraping)
206
  message_lower = message.lower()
207
 
208
  if any(word in message_lower for word in ['course', 'courses', 'learn', 'study']):
 
884
  if (line.trim().startsWith('•') || line.trim().startsWith('-')) {
885
  return `<div style="display: flex; align-items: flex-start; margin-bottom: 0.375rem;">
886
  <span style="color: #16a34a; margin-right: 0.5rem; margin-top: 0.125rem; font-size: 0.875rem; font-weight: 500;">•</span>
887
+ <span style="font-size: 0.875rem; line-height: 1.5;">${line.replace(/^[•-]\s*/, '')}</span>
888
  </div>`;
889
+ } else if (/^\d+\./.test(line.trim())) {
890
+ const match = line.match(/^\d+\./);
891
  return `<div style="display: flex; align-items: flex-start; margin-bottom: 0.375rem;">
892
  <span style="color: #16a34a; margin-right: 0.5rem; font-weight: 600; font-size: 0.875rem;">${match ? match[0] : ''}</span>
893
+ <span style="font-size: 0.875rem; line-height: 1.5;">${line.replace(/^\d+\.\s*/, '')}</span>
894
  </div>`;
895
  } else if (line.trim() === '') {
896
  return '<br>';
 
941
  if not message:
942
  return jsonify({'error': 'No message provided'}), 400
943
 
944
+ # Retrieve relevant documents based on the user's query
945
  if retriever:
946
  docs = retriever.get_relevant_documents(message)
947
+ context = "\n\n".join([doc.page_content for doc in docs[:5]]) # Get top 5 relevant documents
948
+ print(f"Context from retriever: {context[:200]}...") # Print first 200 chars for debug
949
  else:
950
  context = "I'm an AI assistant for Atomcamp, a data science education platform."
951
+ print("Retriever not initialized, using default context.")
952
+
953
  response = generate_response(message, context)
954
  return jsonify({'response': response})
955
 
956
  except Exception as e:
957
+ print(f"Error in chat endpoint: {e}")
958
  return jsonify({'error': f'Error: {str(e)}'}), 500
959
 
960
  @app.route('/static/<path:filename>')