# Website chatbot: scrape a site, embed its paragraphs into a FAISS index,
# and answer user questions via nearest-neighbor retrieval + summarization.
import requests
from bs4 import BeautifulSoup
import json
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import streamlit as st
# Step 1: Scrape Website Data
url = "https://aspireec.com/"
# Bound the request and fail fast on HTTP errors instead of silently
# indexing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# Extract every non-empty <p> paragraph as one retrievable passage.
content = soup.find_all('p')
website_data = [p.text.strip() for p in content if p.text.strip()]

# Persist the extracted content so it can be inspected or reused later.
# ensure_ascii=False keeps non-ASCII text readable in the JSON file.
with open('website_data.json', 'w', encoding='utf-8') as file:
    json.dump(website_data, file, ensure_ascii=False)
# Step 2: Create Embeddings and FAISS Index
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(website_data)

# IndexFlatL2 does exact L2 (Euclidean) nearest-neighbor search.
# FAISS requires float32 input, so cast explicitly rather than relying
# on the encoder's output dtype.
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(np.asarray(embeddings, dtype=np.float32))

# Step 3: Summarization Model
summarizer = pipeline("summarization", model="google/flan-t5-base")
# Step 4: Define the `get_answer` Function
def get_answer(query, k=1):
    """Answer *query* by retrieving the closest passage and summarizing it.

    Args:
        query: Free-text user question.
        k: Number of nearest passages to retrieve (backward-compatible
           default of 1; only the single best match is summarized).

    Returns:
        A short summary string of the best-matching website passage.
    """
    # Embed the query into the same vector space as the indexed passages;
    # FAISS expects float32 query vectors.
    query_embedding = model.encode([query])
    distances, indices = index.search(
        np.asarray(query_embedding, dtype=np.float32), k=k
    )

    # indices[0][0] is the row of the closest passage in website_data.
    best_match = website_data[indices[0][0]]

    # do_sample=False makes the summary deterministic for a given passage.
    summarized_response = summarizer(
        best_match, max_length=50, min_length=10, do_sample=False
    )
    return summarized_response[0]['summary_text']
# Step 5: Streamlit Chatbot UI
st.title("Website Chatbot")
user_input = st.text_input("Ask me anything about the website:")
if user_input:
    # Retrieve the closest passage from the FAISS index and display its
    # summarized form as the chatbot's reply.
    response = get_answer(user_input)
    st.write(response)