Spaces:

pradeep4321
/

sample_crawler

Sleeping

App Files Files Community

pradeep4321 committed on Apr 15

Commit

ec67f77

verified ·

1 Parent(s): fd59730

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +225 -38

src/streamlit_app.py CHANGED Viewed

@@ -1,40 +1,227 @@
-import altair as alt
-import numpy as np
-import pandas as pd
 import streamlit as st
-"""
-# Welcome to Streamlit!
-Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
-If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
-forums](https://discuss.streamlit.io).
-In the meantime, below is an example of what you can do with just a few lines of code:
-"""
-num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
-num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
-indices = np.linspace(0, 1, num_points)
-theta = 2 * np.pi * num_turns * indices
-radius = indices
-x = radius * np.cos(theta)
-y = radius * np.sin(theta)
-df = pd.DataFrame({
-    "x": x,
-    "y": y,
-    "idx": indices,
-    "rand": np.random.randn(num_points),
-})
-st.altair_chart(alt.Chart(df, height=700, width=700)
-    .mark_point(filled=True)
-    .encode(
-        x=alt.X("x", axis=None),
-        y=alt.Y("y", axis=None),
-        color=alt.Color("idx", legend=None, scale=alt.Scale()),
-        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
-    ))

+# =========================================================
+# 🌐 WEBSITE RAG + IMAGE UNDERSTANDING (HF SPACES)
+# =========================================================
 import streamlit as st
+import requests
+from bs4 import BeautifulSoup
+import numpy as np
+import faiss
+import torch
+from PIL import Image
+from io import BytesIO
+from sentence_transformers import SentenceTransformer
+from transformers import pipeline, BlipProcessor, BlipForConditionalGeneration
+# ==============================
+# PAGE CONFIG
+# ==============================
+st.set_page_config(page_title="🌐 Website QA System", layout="wide")
+# ==============================
+# LOAD MODELS
+# ==============================
+@st.cache_resource
+def load_models():
+    """Load the embedding, answer-generation and image-captioning models.
+
+    Decorated with ``st.cache_resource`` so Streamlit can reuse the loaded
+    objects instead of reloading them on every script run.
+
+    Returns:
+        A 4-tuple ``(embed_model, qa_pipeline, processor, image_model)``: the
+        MiniLM sentence encoder, the FLAN-T5 ``text2text-generation``
+        pipeline, and the BLIP captioning processor and model.
+    """
+    # Processor and model must come from the same BLIP checkpoint.
+    blip_checkpoint = "Salesforce/blip-image-captioning-base"
+    sentence_encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
+    answer_generator = pipeline(
+        "text2text-generation", model="google/flan-t5-base", max_length=256
+    )
+    caption_processor = BlipProcessor.from_pretrained(blip_checkpoint)
+    caption_model = BlipForConditionalGeneration.from_pretrained(blip_checkpoint)
+    return sentence_encoder, answer_generator, caption_processor, caption_model
+# Unpack the loaded models into the module-level names used by the rest of
+# the script.
+embed_model, qa_pipeline, processor, image_model = load_models()
+# ==============================
+# SESSION STATE
+# ==============================
+# Seed each key only when it is absent, so values already stored in the
+# session are left as they are.
+for _state_key, _initial_value in (("documents", []), ("index", None)):
+    if _state_key not in st.session_state:
+        st.session_state[_state_key] = _initial_value
+# ==============================
+# CRAWL WEBSITE
+# ==============================
+def crawl_website(url, max_links=20, timeout=10):
+    """Collect the absolute http(s) links found on the page at ``url``.
+
+    Args:
+        url: Address of the page whose ``<a href=...>`` anchors are scanned.
+        max_links: Upper bound on the number of links returned.
+        timeout: Seconds to wait for the server before giving up.
+
+    Returns:
+        De-duplicated links in the order they appear on the page, at most
+        ``max_links`` of them. An empty list when the page cannot be fetched.
+    """
+    from urllib.parse import urljoin
+    try:
+        # Without a timeout a stalled server would block the app indefinitely.
+        res = requests.get(url, timeout=timeout)
+    except requests.RequestException:
+        # Best effort: an unreachable or malformed URL simply yields no links.
+        return []
+    soup = BeautifulSoup(res.text, "html.parser")
+    # Dict keys de-duplicate while keeping first-seen order, so the cut-off
+    # below always keeps the same links (a set has no defined order).
+    links = {}
+    for a in soup.find_all("a", href=True):
+        try:
+            # Resolve relative hrefs such as "/about" against the fetched page.
+            link = urljoin(res.url, a["href"].strip())
+        except ValueError:
+            # Malformed href (e.g. a broken IPv6 literal); skip just this one.
+            continue
+        # Drops mailto:, javascript:, tel: and similar non-web schemes.
+        if link.startswith("http"):
+            links[link] = None
+    return list(links)[:max_links]
+# ==============================
+# EXTRACT CONTENT
+# ==============================
+def extract_content(url, max_images=5, timeout=10):
+    """Return a page's paragraph text followed by captions of its images.
+
+    Args:
+        url: Address of the page to download.
+        max_images: How many ``<img>`` tags (in document order) to consider
+            for captioning.
+        timeout: Seconds to wait on each HTTP request before giving up.
+
+    Returns:
+        The text of all ``<p>`` elements joined by spaces, then a space, then
+        the generated image captions joined by spaces. An empty string when
+        the page itself cannot be fetched.
+    """
+    try:
+        res = requests.get(url, timeout=timeout)
+    except requests.RequestException:
+        # Best effort: a page that cannot be downloaded contributes no text.
+        return ""
+    soup = BeautifulSoup(res.text, "html.parser")
+    # TEXT
+    text = " ".join(p.get_text() for p in soup.find_all("p"))
+    # IMAGES → CAPTION
+    image_texts = []
+    for img in soup.find_all("img")[:max_images]:
+        img_url = img.get("src")
+        # Skip tags with no src, and relative or data: sources.
+        if not img_url or not img_url.startswith("http"):
+            continue
+        try:
+            img_res = requests.get(img_url, timeout=timeout)
+            image = Image.open(BytesIO(img_res.content)).convert("RGB")
+            inputs = processor(image, return_tensors="pt")
+            out = image_model.generate(**inputs)
+            caption = processor.decode(out[0], skip_special_tokens=True)
+        except Exception:
+            # Captioning is optional: a broken download, an undecodable image
+            # or a model error only costs this one caption. Unlike a bare
+            # ``except:``, KeyboardInterrupt and SystemExit still propagate.
+            continue
+        image_texts.append(caption)
+    return text + " " + " ".join(image_texts)
+# ==============================
+# CHUNKING
+# ==============================
+def chunk_text(text, size=300):
+    """Split ``text`` into consecutive chunks of at most ``size`` words.
+
+    Words are whatever ``str.split()`` yields, so runs of whitespace collapse
+    and each chunk is re-joined with single spaces.
+
+    Args:
+        text: The text to split.
+        size: Maximum number of words per chunk.
+
+    Returns:
+        A list of chunk strings; empty when ``text`` contains no words.
+    """
+    tokens = text.split()
+    return [
+        " ".join(tokens[start:start + size])
+        for start in range(0, len(tokens), size)
+    ]
+# ==============================
+# BUILD INDEX
+# ==============================
+def build_index(texts):
+    """Embed ``texts`` and load the vectors into a new FAISS L2 index.
+
+    Args:
+        texts: The text chunks to embed with ``embed_model``.
+
+    Returns:
+        ``(index, embeddings)``: the populated ``faiss.IndexFlatL2`` and the
+        embeddings exactly as returned by ``embed_model.encode``.
+    """
+    vectors = embed_model.encode(texts)
+    # The index dimensionality is the width of one embedding row.
+    flat_index = faiss.IndexFlatL2(vectors.shape[1])
+    flat_index.add(np.array(vectors))
+    return flat_index, vectors
+# ==============================
+# UI
+# ==============================
+st.title("🌐 Website QA with Images")
+url = st.text_input("🔗 Enter Website URL")
+# Crawl on demand and keep the result in session state, which the
+# page-selection section reads to decide whether to render.
+crawl_requested = st.button("Crawl Website")
+if crawl_requested:
+    found_links = crawl_website(url)
+    if not found_links:
+        st.error("No links found")
+    else:
+        st.session_state.links = found_links
+        st.success(f"Found {len(found_links)} pages")
+# ==============================
+# PAGE SELECTION
+# ==============================
+# Shown only after a successful crawl has stored links in the session.
+if "links" in st.session_state:
+    st.subheader("Select Pages to Train")
+    # One checkbox per crawled link; ticked links are the training set.
+    selected_links = []
+    for link in st.session_state.links:
+        if st.checkbox(link):
+            selected_links.append(link)
+    if st.button("Train Selected Pages"):
+        if not selected_links:
+            # Previously the button did nothing, silently, in this case.
+            st.warning("Select at least one page to train on")
+        else:
+            all_chunks = []
+            with st.spinner("Processing pages..."):
+                for link in selected_links:
+                    all_chunks.extend(chunk_text(extract_content(link)))
+            if all_chunks:
+                # Only the index is kept; the raw embeddings are not needed.
+                index, _ = build_index(all_chunks)
+                # Replace (not extend) any earlier training run.
+                st.session_state.documents = all_chunks
+                st.session_state.index = index
+                st.success("Training completed!")
+            else:
+                # Every selected page failed to download or had no text.
+                st.error("No text could be extracted from the selected pages")
+# ==============================
+# ADD MORE PAGES
+# ==============================
+# Lets the user append one extra page to whatever has been indexed so far.
+if "links" in st.session_state:
+    st.subheader("➕ Add More Pages")
+    new_url = st.text_input("Add another URL")
+    if st.button("Add & Train"):
+        content = extract_content(new_url)
+        chunks = chunk_text(content)
+        if chunks:
+            new_embeddings = np.array(embed_model.encode(chunks))
+            if st.session_state.index is None:
+                # No training run has happened yet, so there is no index to
+                # extend; create one sized to the embedding width instead of
+                # crashing on ``None.add``.
+                st.session_state.index = faiss.IndexFlatL2(new_embeddings.shape[1])
+            st.session_state.index.add(new_embeddings)
+            # Keep documents aligned with the index rows: same order of append.
+            st.session_state.documents.extend(chunks)
+            st.success("Added new page!")
+        else:
+            # Download failed or the page had no paragraph/image text.
+            st.warning("No text could be extracted from that URL")
+# ==============================
+# ASK QUESTIONS
+# ==============================
+st.subheader("💬 Ask Questions")
+query = st.text_input("Ask something from the website")
+if st.button("Get Answer"):
+    if st.session_state.index is None or not st.session_state.documents:
+        st.warning("Please train pages first")
+    elif not query.strip():
+        # Do not run retrieval and generation for a blank question.
+        st.warning("Please enter a question")
+    else:
+        documents = st.session_state.documents
+        q_embed = embed_model.encode([query])
+        # Ask for at most as many neighbours as there are chunks: FAISS pads
+        # missing results with -1, which as a Python index would silently
+        # pull in the last chunk.
+        top_k = min(5, len(documents))
+        _distances, neighbour_ids = st.session_state.index.search(
+            np.array(q_embed), k=top_k
+        )
+        # Defensive filter in case the index and documents ever get out of step.
+        context = " ".join(
+            documents[i] for i in neighbour_ids[0] if 0 <= i < len(documents)
+        )
+        prompt = f"""
+        Answer the question based on the context.
+        Context:
+        {context}
+        Question:
+        {query}
+        """
+        answer = qa_pipeline(prompt)[0]["generated_text"]
+        st.write("### ✅ Answer")
+        st.write(answer)