Spaces:

pradeep4321
/

sample_crawler

Sleeping

App Files Community

pradeep4321 committed on Apr 15

Commit

46325f0

verified ·

1 Parent(s): 2f54431

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +56 -160

src/streamlit_app.py CHANGED Viewed

@@ -1,61 +1,27 @@
 # =========================================================
-# 🌐 WEBSITE RAG + IMAGE QA (HF SPACES FIXED VERSION)
 # =========================================================
 import streamlit as st
 import requests
 from bs4 import BeautifulSoup
-import numpy as np
-import faiss
-import torch
-from PIL import Image
-from io import BytesIO
 from urllib.parse import urljoin
-from sentence_transformers import SentenceTransformer
-from transformers import pipeline, BlipProcessor, BlipForConditionalGeneration
 # ==============================
 # PAGE CONFIG
 # ==============================
-st.set_page_config(page_title="🌐 Website QA System", layout="wide")
-# ==============================
-# LOAD MODELS (FIXED)
-# ==============================
-@st.cache_resource
-def load_models():
-    embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
-    # ✅ FIX: use text-generation instead of text2text-generation
-    qa_pipeline = pipeline(
-        "text-generation",
-        model="google/flan-t5-base",
-        max_length=256,
-        do_sample=False
-    )
-    processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
-    image_model = BlipForConditionalGeneration.from_pretrained(
-        "Salesforce/blip-image-captioning-base"
-    )
-    return embed_model, qa_pipeline, processor, image_model
-embed_model, qa_pipeline, processor, image_model = load_models()
 # ==============================
 # SESSION STATE
 # ==============================
-if "documents" not in st.session_state:
-    st.session_state.documents = []
-if "index" not in st.session_state:
-    st.session_state.index = None
 if "links" not in st.session_state:
     st.session_state.links = []
 # ==============================
 # CRAWL WEBSITE
 # ==============================
@@ -67,19 +33,19 @@ def crawl_website(url):
         links = set()
         for a in soup.find_all("a", href=True):
-            link = urljoin(url, a["href"])  # ✅ FIX relative links
             if link.startswith("http"):
                 links.add(link)
-        return list(links)[:20]
-    except Exception as e:
         return []
 # ==============================
-# EXTRACT CONTENT (TEXT + IMAGES)
 # ==============================
-def extract_content(url):
     try:
         res = requests.get(url, timeout=10)
         soup = BeautifulSoup(res.text, "html.parser")
@@ -88,65 +54,28 @@ def extract_content(url):
         paragraphs = [p.get_text().strip() for p in soup.find_all("p")]
         text = " ".join(paragraphs)
-        # IMAGES → CAPTION
-        image_texts = []
-        images = soup.find_all("img")
-        for img in images[:5]:  # limit
-            try:
-                img_url = urljoin(url, img.get("src"))
-                img_res = requests.get(img_url, timeout=5)
-                image = Image.open(BytesIO(img_res.content)).convert("RGB")
-                inputs = processor(image, return_tensors="pt")
-                out = image_model.generate(**inputs)
-                caption = processor.decode(out[0], skip_special_tokens=True)
-                image_texts.append(caption)
-            except:
-                continue
-        return text + " " + " ".join(image_texts)
     except:
-        return ""
-# ==============================
-# CHUNKING
-# ==============================
-def chunk_text(text, size=300):
-    words = text.split()
-    return [" ".join(words[i:i+size]) for i in range(0, len(words), size)]
-# ==============================
-# BUILD FAISS INDEX
-# ==============================
-def build_index(texts):
-    embeddings = embed_model.encode(texts)
-    dim = embeddings.shape[1]
-    index = faiss.IndexFlatL2(dim)
-    index.add(np.array(embeddings))
-    return index
-# ==============================
-# ADD TO EXISTING INDEX
-# ==============================
-def add_to_index(new_chunks):
-    new_embeddings = embed_model.encode(new_chunks)
-    st.session_state.index.add(np.array(new_embeddings))
-    st.session_state.documents.extend(new_chunks)
 # ==============================
 # UI
 # ==============================
-st.title("🌐 Website QA with Images (Fixed)")
 # ==============================
-# STEP 1: URL INPUT
 # ==============================
 url = st.text_input("🔗 Enter Website URL")
@@ -157,92 +86,59 @@ if st.button("Crawl Website"):
         st.session_state.links = links
         st.success(f"Found {len(links)} pages")
     else:
-        st.error("No links found or invalid URL")
 # ==============================
-# STEP 2: PAGE SELECTION
 # ==============================
 selected_links = []
 if st.session_state.links:
-    st.subheader("📄 Select Pages to Train")
     for link in st.session_state.links:
         if st.checkbox(link):
             selected_links.append(link)
-    if st.button("Train Selected Pages"):
-        all_chunks = []
-        with st.spinner("Processing pages..."):
-            for link in selected_links:
-                content = extract_content(link)
-                chunks = chunk_text(content)
-                all_chunks.extend(chunks)
-        if all_chunks:
-            st.session_state.index = build_index(all_chunks)
-            st.session_state.documents = all_chunks
-            st.success("✅ Training completed!")
-        else:
-            st.warning("No content extracted")
 # ==============================
-# STEP 3: ADD MORE PAGES
 # ==============================
-st.subheader("➕ Add More Pages")
-new_url = st.text_input("Enter another page URL")
-if st.button("Add & Train"):
-    content = extract_content(new_url)
-    chunks = chunk_text(content)
-    if chunks:
-        if st.session_state.index is None:
-            st.session_state.index = build_index(chunks)
-            st.session_state.documents = chunks
-        else:
-            add_to_index(chunks)
-        st.success("✅ Page added successfully!")
     else:
-        st.error("Failed to extract content")
 # ==============================
-# STEP 4: ASK QUESTIONS
 # ==============================
-st.subheader("💬 Ask Questions")
-query = st.text_input("Ask something from the website")
-if st.button("Get Answer"):
-    if st.session_state.index is None:
-        st.warning("⚠️ Please train pages first")
-    else:
-        q_embed = embed_model.encode([query])
-        D, I = st.session_state.index.search(np.array(q_embed), k=5)
-        context = " ".join([st.session_state.documents[i] for i in I[0]])
-        prompt = f"""
-        Answer based only on the context.
-        Context:
-        {context}
-        Question:
-        {query}
-        Answer:
-        """
-        response = qa_pipeline(prompt)[0]["generated_text"]
-        # ✅ CLEAN OUTPUT
-        answer = response.replace(prompt, "").strip()
-        st.write("### ✅ Answer")
-        st.write(answer if answer else "No relevant answer found")

 # =========================================================
+# 🌐 WEBSITE CRAWLER + DOWNLOAD TOOL
 # =========================================================
 import streamlit as st
 import requests
 from bs4 import BeautifulSoup
+import pandas as pd
 from urllib.parse import urljoin
 # ==============================
 # PAGE CONFIG
 # ==============================
+st.set_page_config(page_title="🌐 Website Crawler", layout="wide")
 # ==============================
 # SESSION STATE
 # ==============================
 if "links" not in st.session_state:
     st.session_state.links = []
+if "data" not in st.session_state:
+    st.session_state.data = []
 # ==============================
 # CRAWL WEBSITE
 # ==============================
         links = set()
         for a in soup.find_all("a", href=True):
+            link = urljoin(url, a["href"])
             if link.startswith("http"):
                 links.add(link)
+        return list(links)[:30]
+    except:
         return []
 # ==============================
+# EXTRACT PAGE CONTENT
 # ==============================
+def extract_page(url):
     try:
         res = requests.get(url, timeout=10)
         soup = BeautifulSoup(res.text, "html.parser")
         paragraphs = [p.get_text().strip() for p in soup.find_all("p")]
         text = " ".join(paragraphs)
+        # IMAGES
+        images = []
+        for img in soup.find_all("img"):
+            img_url = urljoin(url, img.get("src"))
+            images.append(img_url)
+        return {
+            "url": url,
+            "text": text,
+            "images": images
+        }
     except:
+        return None
 # ==============================
 # UI
 # ==============================
+st.title("🌐 Website Crawler + Downloader")
 # ==============================
+# STEP 1: ENTER URL
 # ==============================
 url = st.text_input("🔗 Enter Website URL")
         st.session_state.links = links
         st.success(f"Found {len(links)} pages")
     else:
+        st.error("No links found")
 # ==============================
+# STEP 2: SELECT PAGES
 # ==============================
 selected_links = []
 if st.session_state.links:
+    st.subheader("📄 Select Pages to Crawl")
     for link in st.session_state.links:
         if st.checkbox(link):
             selected_links.append(link)
 # ==============================
+# STEP 3: EXTRACT DATA
 # ==============================
+if st.button("Extract Selected Pages"):
+    all_data = []
+    with st.spinner("Extracting content..."):
+        for link in selected_links:
+            data = extract_page(link)
+            if data:
+                all_data.append(data)
+    if all_data:
+        st.session_state.data = all_data
+        st.success("✅ Data extracted successfully!")
     else:
+        st.warning("No data extracted")
 # ==============================
+# STEP 4: SHOW DATA
 # ==============================
+if st.session_state.data:
+    st.subheader("📊 Extracted Data Preview")
+    df = pd.DataFrame(st.session_state.data)
+    st.dataframe(df)
+# ==============================
+# STEP 5: DOWNLOAD OPTIONS
+# ==============================
+if st.session_state.data:
+    st.subheader("⬇️ Download Data")
+    df = pd.DataFrame(st.session_state.data)
+    # CSV
+    csv = df.to_csv(index=False).encode("utf-8")
+    st.download_button("Download CSV", csv, "website_data.csv", "text/csv")
+    # JSON
+    json_data = df.to_json(orient="records", indent=2)
+    st.download_button("Download JSON", json_data, "website_data.json", "application/json")