muddasser committed on
Commit
02d4635
·
verified ·
1 Parent(s): 0a25fe2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +30 -47
app.py CHANGED
@@ -3,7 +3,7 @@ import os
3
  import re
4
  import logging
5
  import torch
6
- from transformers import AutoTokenizer, AutoModelForCausalLM
7
  from playwright.sync_api import sync_playwright
8
  from langchain.text_splitter import RecursiveCharacterTextSplitter
9
  from langchain_community.vectorstores import FAISS
@@ -16,10 +16,11 @@ logging.basicConfig(
16
  format='%(asctime)s - %(levelname)s - %(message)s'
17
  )
18
 
19
- MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
 
20
 
21
  st.set_page_config(
22
- page_title="RAG · TinyLlama",
23
  page_icon="🕸️",
24
  layout="wide",
25
  initial_sidebar_state="collapsed"
@@ -213,7 +214,6 @@ for key, default in [
213
  # ── Utilities ──────────────────────────────────────────────────────────────────
214
 
215
  def clean_text(text):
216
- # Only collapse whitespace — preserve prices, commas, symbols
217
  text = re.sub(r'[ \t]+', ' ', text)
218
  text = re.sub(r'\n{3,}', '\n\n', text)
219
  return text.strip()
@@ -227,7 +227,7 @@ def is_valid_url(url):
227
  def load_model():
228
  try:
229
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
230
- model = AutoModelForCausalLM.from_pretrained(
231
  MODEL_NAME,
232
  torch_dtype=torch.float32,
233
  low_cpu_mem_usage=True,
@@ -246,17 +246,15 @@ def scrape_website(url):
246
  browser = p.chromium.launch(headless=True, args=['--no-sandbox', '--disable-dev-shm-usage'])
247
  page = browser.new_page()
248
  try:
249
- # networkidle times out on ad-heavy sites like whatmobile.com.pk
250
- # domcontentloaded fires as soon as HTML is parsed, then we wait
251
- # a few seconds for JS-rendered content to appear
252
  try:
253
  page.goto(url, wait_until="domcontentloaded", timeout=30000)
254
  except Exception:
255
- pass # even if it times out, content may already be there
256
- page.wait_for_timeout(3000) # give JS 3s to render
257
  title = page.title()
258
 
259
- # Strategy 1: extract from <li> elements — good for listing/price pages
260
  lines = []
261
  for li in page.query_selector_all("li"):
262
  try:
@@ -305,7 +303,8 @@ def scrape_website(url):
305
  @st.cache_resource
306
  def create_vector_store(text):
307
  try:
308
- splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
 
309
  docs = [Document(page_content=c) for c in splitter.split_text(text)]
310
  emb = HuggingFaceEmbeddings(
311
  model_name="sentence-transformers/all-MiniLM-L6-v2",
@@ -325,54 +324,36 @@ def answer_question(question):
325
  if tokenizer is None:
326
  return "Model failed to load. Check logs."
327
  try:
328
- # Retrieve top 3 relevant chunks from FAISS
329
- docs = st.session_state.vector_store.similarity_search(question, k=3)
330
  context = " ".join(d.page_content for d in docs)
331
 
332
- # TinyLlama expects the chat template format
333
- messages = [
334
- {
335
- "role": "system",
336
- "content": (
337
- "You are a helpful assistant. Answer the user's question using "
338
- "ONLY the context provided. If the answer is not in the context, "
339
- "say \"I don't know\"."
340
- ),
341
- },
342
- {
343
- "role": "user",
344
- "content": f"Context:\n{context}\n\nQuestion: {question}",
345
- },
346
- ]
347
-
348
- # Apply chat template → produces <|system|>...<|user|>...<|assistant|>
349
- prompt = tokenizer.apply_chat_template(
350
- messages,
351
- tokenize=False,
352
- add_generation_prompt=True, # appends <|assistant|> so model starts answering
353
  )
354
 
355
  inputs = tokenizer(
356
  prompt,
357
  return_tensors="pt",
358
  truncation=True,
359
- max_length=2048, # TinyLlama's full context window
360
  )
361
 
362
  with torch.no_grad():
363
  outputs = model.generate(
364
  **inputs,
365
  max_new_tokens=300,
366
- do_sample=True,
367
- temperature=0.7,
368
- top_p=0.95,
369
- repetition_penalty=1.1,
370
- pad_token_id=tokenizer.eos_token_id,
371
  )
372
 
373
- # Slice off the prompt tokens — only decode what the model generated
374
- generated = outputs[0][inputs["input_ids"].shape[1]:]
375
- return tokenizer.decode(generated, skip_special_tokens=True).strip()
376
 
377
  except Exception as e:
378
  logging.error(f"Inference error: {e}")
@@ -388,7 +369,9 @@ with st.sidebar:
388
  st.markdown("**Model**")
389
  st.markdown(f"`{MODEL_NAME}`")
390
  st.markdown("**Context window**")
391
- st.markdown("`2048 tokens`")
 
 
392
  st.markdown("**Status**")
393
  if model_ok:
394
  st.success("Model loaded ✓")
@@ -407,7 +390,7 @@ st.markdown(f"""
407
  </div>
408
  <div class="model-badge">
409
  <div class="model-dot" style="background:{dot_color};"></div>
410
  {dot_label} &nbsp;·&nbsp; TinyLlama-1.1B-Chat
411
  </div>
412
  </div>
413
  """, unsafe_allow_html=True)
@@ -477,7 +460,7 @@ if st.session_state.scraped_content:
477
  with st.chat_message("user"):
478
  st.markdown(prompt)
479
  with st.chat_message("assistant"):
480
- with st.spinner("TinyLlama is thinking…"):
481
  answer = answer_question(prompt)
482
  st.markdown(answer)
483
  st.session_state.chat_history.append({"role": "assistant", "content": answer})
 
3
  import re
4
  import logging
5
  import torch
6
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
7
  from playwright.sync_api import sync_playwright
8
  from langchain.text_splitter import RecursiveCharacterTextSplitter
9
  from langchain_community.vectorstores import FAISS
 
16
  format='%(asctime)s - %(levelname)s - %(message)s'
17
  )
18
 
19
+ MODEL_NAME = "google/long-t5-tglobal-large"
20
+ MAX_INPUT_LEN = 16384 # LongT5's full context window
21
 
22
  st.set_page_config(
23
+ page_title="RAG · LongT5",
24
  page_icon="🕸️",
25
  layout="wide",
26
  initial_sidebar_state="collapsed"
 
214
  # ── Utilities ──────────────────────────────────────────────────────────────────
215
 
216
  def clean_text(text):
 
217
  text = re.sub(r'[ \t]+', ' ', text)
218
  text = re.sub(r'\n{3,}', '\n\n', text)
219
  return text.strip()
 
227
  def load_model():
228
  try:
229
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
230
+ model = AutoModelForSeq2SeqLM.from_pretrained(
231
  MODEL_NAME,
232
  torch_dtype=torch.float32,
233
  low_cpu_mem_usage=True,
 
246
  browser = p.chromium.launch(headless=True, args=['--no-sandbox', '--disable-dev-shm-usage'])
247
  page = browser.new_page()
248
  try:
249
+ # domcontentloaded avoids timeout on ad-heavy sites
 
 
250
  try:
251
  page.goto(url, wait_until="domcontentloaded", timeout=30000)
252
  except Exception:
253
+ pass # content may already be loaded even on timeout
254
+ page.wait_for_timeout(3000) # allow JS 3s to render
255
  title = page.title()
256
 
257
+ # Strategy 1: <li> items — great for price/listing pages
258
  lines = []
259
  for li in page.query_selector_all("li"):
260
  try:
 
303
  @st.cache_resource
304
  def create_vector_store(text):
305
  try:
306
+ # Larger chunks since LongT5 can handle much more context
307
+ splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
308
  docs = [Document(page_content=c) for c in splitter.split_text(text)]
309
  emb = HuggingFaceEmbeddings(
310
  model_name="sentence-transformers/all-MiniLM-L6-v2",
 
324
  if tokenizer is None:
325
  return "Model failed to load. Check logs."
326
  try:
327
+ # Retrieve more chunks — LongT5 can handle it
328
+ docs = st.session_state.vector_store.similarity_search(question, k=6)
329
  context = " ".join(d.page_content for d in docs)
330
 
331
+ # LongT5 uses plain text prompt like T5 — no chat template needed
332
+ prompt = (
333
+ "Answer the question using only the context provided. "
334
+ "If the answer is not in the context, say \"I don't know\".\n\n"
335
+ f"Context: {context}\n\n"
336
+ f"Question: {question}\n\n"
337
+ "Answer:"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
338
  )
339
 
340
  inputs = tokenizer(
341
  prompt,
342
  return_tensors="pt",
343
  truncation=True,
344
+ max_length=MAX_INPUT_LEN, # full 16,384 token window
345
  )
346
 
347
  with torch.no_grad():
348
  outputs = model.generate(
349
  **inputs,
350
  max_new_tokens=300,
351
+ num_beams=4,
352
+ early_stopping=True,
353
+ no_repeat_ngram_size=3,
 
 
354
  )
355
 
356
+ return tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
 
 
357
 
358
  except Exception as e:
359
  logging.error(f"Inference error: {e}")
 
369
  st.markdown("**Model**")
370
  st.markdown(f"`{MODEL_NAME}`")
371
  st.markdown("**Context window**")
372
+ st.markdown("`16,384 tokens`")
373
+ st.markdown("**Architecture**")
374
+ st.markdown("`Encoder-Decoder`")
375
  st.markdown("**Status**")
376
  if model_ok:
377
  st.success("Model loaded ✓")
 
390
  </div>
391
  <div class="model-badge">
392
  <div class="model-dot" style="background:{dot_color};"></div>
393
+ {dot_label} &nbsp;·&nbsp; LongT5-16k
394
  </div>
395
  </div>
396
  """, unsafe_allow_html=True)
 
460
  with st.chat_message("user"):
461
  st.markdown(prompt)
462
  with st.chat_message("assistant"):
463
+ with st.spinner("LongT5 is thinking…"):
464
  answer = answer_question(prompt)
465
  st.markdown(answer)
466
  st.session_state.chat_history.append({"role": "assistant", "content": answer})