varshasharma01 committed on
Commit
e6fa2c9
·
verified ·
1 Parent(s): abadfbe

Update src/main.py

Browse files
Files changed (1) hide show
  1. src/main.py +66 -5
src/main.py CHANGED
@@ -11,7 +11,7 @@ import requests
11
  from bs4 import BeautifulSoup
12
  import base64
13
  import io
14
-
15
  from PIL import Image
16
  from urllib.parse import urlparse, parse_qs
17
  from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
@@ -248,32 +248,93 @@ def generate_image_answer(query, image):
248
 
249
  # -------- URL HELPERS --------
250
 
 
 
 
 
 
 
 
def extract_text_from_url(url: str):
    """Download a web page and return its visible text, capped at 5000 chars.

    Args:
        url: Address of the page to fetch.

    Returns:
        The page text with ``<script>`` and ``<style>`` content removed and
        runs of whitespace collapsed to single spaces, truncated to 5000
        characters. ``None`` when the response status is not 200 or when any
        exception is raised while fetching or parsing.
    """
    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code != 200:
            return None

        soup = BeautifulSoup(response.text, "html.parser")
        for tag in soup(["script", "style"]):
            tag.decompose()

        # get_text() keeps every newline and indent from the markup. Collapse
        # them first so the 5000-character budget is spent on real content and
        # a page with no visible text yields "" (falsy) instead of blanks.
        text = " ".join(soup.get_text(separator=" ").split())
        return text[:5000]
    except Exception as e:
        # Best effort: report the failure and let the caller handle None.
        print(f"URL Error: {e}")
        return None
265
 
266
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
267
  def generate_url_answer(url: str, query: str = None):
268
- text = extract_text_from_url(url)
269
- if not text:
 
 
 
 
 
 
270
  return "Could not fetch content from this URL."
 
271
  try:
272
  user_msg = (
273
  f"URL: {url}\n\n"
274
- f"Content:\n{text}\n\n"
275
  f"Task: Explain what this page is about in simple words."
276
  )
 
277
  if query:
278
  user_msg += f"\n\nUser Question: {query}"
279
 
@@ -290,12 +351,12 @@ def generate_url_answer(url: str, query: str = None):
290
  }
291
  ]
292
  )
 
293
  return response.choices[0].message.content
294
 
295
  except Exception as e:
296
  return f"Error: {str(e)}"
297
 
298
-
299
  # -------- YOUTUBE HELPERS --------
300
 
301
  def get_video_id(url: str):
 
11
  from bs4 import BeautifulSoup
12
  import base64
13
  import io
14
+ from urllib.parse import urljoin
15
  from PIL import Image
16
  from urllib.parse import urlparse, parse_qs
17
  from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
 
248
 
249
  # -------- URL HELPERS --------
250
 
def normalize_url(url: str):
    """Return *url* stripped of surrounding whitespace and with a scheme.

    Args:
        url: A user-supplied address, with or without a scheme.

    Returns:
        The trimmed URL. An ``http://`` or ``https://`` prefix is recognised
        in any letter case and left untouched; a protocol-relative address
        (``//host/path``) gets ``https:`` prepended; anything else gets
        ``https://`` prepended. An empty or whitespace-only input is returned
        as the empty string.
    """
    url = url.strip()
    if not url:
        # Nothing to normalise; "https://" alone would not be a usable URL.
        return url
    if url.startswith("//"):
        # Protocol-relative form: only the scheme is missing.
        return "https:" + url
    # URL schemes are case-insensitive, so compare on a lowered copy.
    if not url.lower().startswith(("http://", "https://")):
        url = "https://" + url
    return url
256
+
257
+
def extract_text_from_url(url: str):
    """Download a web page and return its visible text, capped at 5000 chars.

    Args:
        url: Address of the page to fetch; it is passed through
            ``normalize_url`` before the request is made.

    Returns:
        The page text with ``<script>`` and ``<style>`` content removed and
        runs of whitespace collapsed to single spaces, truncated to 5000
        characters. ``None`` when the response status is not 200 or when any
        exception is raised while fetching or parsing.
    """
    try:
        url = normalize_url(url)

        headers = {"User-Agent": "Mozilla/5.0"}
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code != 200:
            return None

        soup = BeautifulSoup(response.text, "html.parser")

        # Remove unwanted tags
        for tag in soup(["script", "style"]):
            tag.decompose()

        # get_text() keeps every newline and indent from the markup. Collapse
        # them first so the 5000-character budget is spent on real content and
        # a page with no visible text yields "" (falsy) instead of blanks.
        text = " ".join(soup.get_text(separator=" ").split())
        return text[:5000]
    except Exception as e:
        # Best effort: report the failure and let the caller handle None.
        print(f"URL Error: {e}")
        return None
281
 
282
 
def extract_about_contact(base_url: str):
    """Fetch text from the site's About and Contact pages, when linked.

    The page at *base_url* is downloaded and its ``<a href>`` links are
    scanned. The first http(s) link whose href contains "about" and the first
    whose href contains "contact" (case-insensitive) are fetched with
    ``extract_text_from_url``.

    Args:
        base_url: Address of the page whose links are scanned; it is passed
            through ``normalize_url`` first.

    Returns:
        The text of the pages found, separated by a blank line and limited to
        5000 characters in total. The empty string when no matching page is
        found or when any error occurs.
    """
    limit = 5000
    try:
        base_url = normalize_url(base_url)

        headers = {"User-Agent": "Mozilla/5.0"}
        response = requests.get(base_url, headers=headers, timeout=10)
        soup = BeautifulSoup(response.text, "html.parser")

        about_url = None
        contact_url = None

        for anchor in soup.find_all("a", href=True):
            href = anchor["href"]
            full_link = urljoin(base_url, href)

            # mailto:, tel:, javascript: etc. cannot be fetched as pages, and
            # "mailto:contact@..." would otherwise claim the contact slot and
            # block the real Contact page link further down.
            if urlparse(full_link).scheme not in ("http", "https"):
                continue

            lowered = href.lower()
            if about_url is None and "about" in lowered:
                about_url = full_link
            if contact_url is None and "contact" in lowered:
                contact_url = full_link

            if about_url and contact_url:
                break

        # dict.fromkeys drops a duplicate when one link matched both keywords,
        # so the same page is not downloaded twice.
        page_urls = list(
            dict.fromkeys(u for u in (about_url, contact_url) if u)
        )

        texts = []
        for page_url in page_urls:
            page_text = extract_text_from_url(page_url)
            if page_text:
                texts.append(page_text)

        # Share the character budget between the pages so a long About page
        # cannot crowd the Contact page out; space the first page does not
        # use is passed on to the next one.
        remaining = limit
        chunks = []
        for index, page_text in enumerate(texts):
            share = remaining // (len(texts) - index)
            chunk = page_text[:share]
            chunks.append(chunk)
            remaining -= len(chunk)

        return "\n\n".join(chunks)[:limit]

    except Exception as e:
        # Optional enhancement: never let a failure here break the caller.
        print(f"URL Error: {e}")
        return ""
318
+
319
+
320
  def generate_url_answer(url: str, query: str = None):
321
+ url = normalize_url(url) # ✅ FIX ADDED
322
+
323
+ main_text = extract_text_from_url(url)
324
+ extra_text = extract_about_contact(url) # ✅ NEW (optional)
325
+
326
+ text = (main_text or "") + "\n\n" + (extra_text or "")
327
+
328
+ if not text.strip():
329
  return "Could not fetch content from this URL."
330
+
331
  try:
332
  user_msg = (
333
  f"URL: {url}\n\n"
334
+ f"Content:\n{text[:5000]}\n\n"
335
  f"Task: Explain what this page is about in simple words."
336
  )
337
+
338
  if query:
339
  user_msg += f"\n\nUser Question: {query}"
340
 
 
351
  }
352
  ]
353
  )
354
+
355
  return response.choices[0].message.content
356
 
357
  except Exception as e:
358
  return f"Error: {str(e)}"
359
 
 
360
  # -------- YOUTUBE HELPERS --------
361
 
362
  def get_video_id(url: str):