Spaces:

varshasharma01
/

Context-Hub

Sleeping

App Files Files Community

varshasharma01 committed on Apr 24

Commit

e6fa2c9

verified ·

1 Parent(s): abadfbe

Update src/main.py

Browse files

Files changed (1) hide show

src/main.py +66 -5

src/main.py CHANGED Viewed

@@ -11,7 +11,7 @@ import requests
 from bs4 import BeautifulSoup
 import base64
 import io
 from PIL import Image
 from urllib.parse import urlparse, parse_qs
 from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
@@ -248,32 +248,93 @@ def generate_image_answer(query, image):
 # -------- URL HELPERS --------
 def extract_text_from_url(url: str):
     """Download *url* and return the first 5000 characters of its text.

     The page is requested with a browser-like User-Agent and a 10 second
     timeout. ``<script>`` and ``<style>`` elements are removed before the
     text is extracted.

     Returns None when the response status is not 200, or when any
     exception is raised along the way (the error is printed).
     """
     try:
         page = requests.get(
             url,
             headers={"User-Agent": "Mozilla/5.0"},
             timeout=10,
         )
         if page.status_code == 200:
             parsed = BeautifulSoup(page.text, "html.parser")
             # Script and style bodies are not readable page content.
             for node in parsed(["script", "style"]):
                 node.decompose()
             return parsed.get_text(separator=" ")[:5000]
         return None
     except Exception as err:
         print(f"URL Error: {err}")
         return None
 def generate_url_answer(url: str, query: str = None):
-    text = extract_text_from_url(url)
-    if not text:
         return "Could not fetch content from this URL."
     try:
         user_msg = (
             f"URL: {url}\n\n"
-            f"Content:\n{text}\n\n"
             f"Task: Explain what this page is about in simple words."
         )
         if query:
             user_msg += f"\n\nUser Question: {query}"
@@ -290,12 +351,12 @@ def generate_url_answer(url: str, query: str = None):
                 }
             ]
         )
         return response.choices[0].message.content
     except Exception as e:
         return f"Error: {str(e)}"
 # -------- YOUTUBE HELPERS --------
 def get_video_id(url: str):

 from bs4 import BeautifulSoup
 import base64
 import io
+from urllib.parse import urljoin
 from PIL import Image
 from urllib.parse import urlparse, parse_qs
 from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
 # -------- URL HELPERS --------
def normalize_url(url: str):
    """Return *url* stripped of surrounding whitespace and with a scheme.

    - A URL that already starts with ``http://`` or ``https://`` (in any
      letter case) is returned unchanged apart from the stripping.
    - A protocol-relative URL (``//host/path``) gets ``https:`` prepended.
    - Anything else gets ``https://`` prepended.
    """
    url = url.strip()
    # Protocol-relative form: only the scheme is missing, not the slashes.
    if url.startswith("//"):
        return "https:" + url
    # URL schemes are case-insensitive, so compare on a lowered copy.
    if not url.lower().startswith(("http://", "https://")):
        url = "https://" + url
    return url
 def extract_text_from_url(url: str):
     """Fetch *url* and return up to 5000 characters of its visible text.

     The URL is passed through ``normalize_url`` first. The page is
     requested with a browser-like User-Agent and a 10 second timeout, and
     ``<script>`` / ``<style>`` elements are removed before extraction.

     Returns None when the response status is not 200, or when any
     exception is raised along the way (the error is printed).
     """
     try:
         url = normalize_url(url)
         headers = {"User-Agent": "Mozilla/5.0"}
         response = requests.get(url, headers=headers, timeout=10)
         if response.status_code != 200:
             return None
         soup = BeautifulSoup(response.text, "html.parser")
         # Remove unwanted tags
         for tag in soup(["script", "style"]):
             tag.decompose()
         # strip=True trims each text fragment and drops the empty ones, so
         # the 5000-character budget is spent on content rather than on the
         # whitespace between tags.
         text = soup.get_text(separator=" ", strip=True)
         return text[:5000]
     except Exception as e:
         print(f"URL Error: {e}")
         return None
def extract_about_contact(base_url: str):
    """Optional enhancement: fetch About & Contact pages.

    Downloads *base_url*, looks through its ``<a href>`` links for the first
    one whose path or query string contains "about" and the first one that
    contains "contact", then fetches those pages with
    ``extract_text_from_url``.

    Returns the combined text (at most 5000 characters), or an empty string
    when the base page cannot be fetched, no matching link is found, or an
    error occurs (the error is printed).
    """
    try:
        base_url = normalize_url(base_url)
        headers = {"User-Agent": "Mozilla/5.0"}
        response = requests.get(base_url, headers=headers, timeout=10)
        # Same rule as extract_text_from_url: do not parse error pages.
        if response.status_code != 200:
            return ""
        soup = BeautifulSoup(response.text, "html.parser")

        about_url = None
        contact_url = None
        for anchor in soup.find_all("a", href=True):
            full_link = urljoin(base_url, anchor["href"])
            parsed = urlparse(full_link)
            # mailto:, tel:, javascript: and similar links cannot be fetched.
            if parsed.scheme not in ("http", "https"):
                continue
            # Match on path and query only: the host name and "#fragment"
            # anchors would otherwise produce false positives.
            target = f"{parsed.path}?{parsed.query}".lower()
            if about_url is None and "about" in target:
                about_url = full_link
            if contact_url is None and "contact" in target:
                contact_url = full_link
            if about_url and contact_url:
                break

        # One link can match both keywords; fetch it only once.
        page_urls = []
        for page_url in (about_url, contact_url):
            if page_url and page_url not in page_urls:
                page_urls.append(page_url)

        texts = [extract_text_from_url(page_url) or "" for page_url in page_urls]
        content = "\n\n".join(text for text in texts if text)
        return content[:5000]
    except Exception as e:
        # Best-effort helper: report the problem and let the caller carry on
        # with the main page text alone.
        print(f"About/Contact Error: {e}")
        return ""
 def generate_url_answer(url: str, query: str = None):
+    url = normalize_url(url)   # ✅ FIX ADDED
+    main_text = extract_text_from_url(url)
+    extra_text = extract_about_contact(url)   # ✅ NEW (optional)
+    text = (main_text or "") + "\n\n" + (extra_text or "")
+    if not text.strip():
         return "Could not fetch content from this URL."
     try:
         user_msg = (
             f"URL: {url}\n\n"
+            f"Content:\n{text[:5000]}\n\n"
             f"Task: Explain what this page is about in simple words."
         )
         if query:
             user_msg += f"\n\nUser Question: {query}"
                 }
             ]
         )
         return response.choices[0].message.content
     except Exception as e:
         return f"Error: {str(e)}"
 # -------- YOUTUBE HELPERS --------
 def get_video_id(url: str):