import hashlib
import os
import re

import chromadb
import google.generativeai as genai
import requests
import streamlit as st
from bs4 import BeautifulSoup
from langchain.docstore.document import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
| genai.configure(api_key="AIzaSyAxUd2tS-qj9C7frYuHRsv92tziXHgIvLo") | |
| CHROMA_PATH = "chroma_db" | |
| chroma_client = chromadb.PersistentClient(path=CHROMA_PATH) | |
| collection = chroma_client.get_or_create_collection(name="formula_1") | |
| embedding_model = SentenceTransformer("all-MiniLM-L6-v2") | |
| def clean_text(text): | |
| text = re.sub(r'http\S+', '', text) | |
| text = re.sub(r'\s+', ' ', text).strip() | |
| return text | |
| def split_content_into_chunks(content): | |
| text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200, length_function=len) | |
| documents = [Document(page_content=content)] | |
| return text_splitter.split_documents(documents) | |
| def add_chunks_to_db(chunks): | |
| documents = [chunk.page_content for chunk in chunks] | |
| ids = [f"ID{i}" for i in range(len(chunks))] | |
| embeddings = embedding_model.encode(documents, convert_to_list=True) | |
| collection.upsert(documents=documents, ids=ids, embeddings=embeddings) | |
| def scrape_text(url): | |
| try: | |
| response = requests.get(url) | |
| response.raise_for_status() | |
| soup = BeautifulSoup(response.text, 'html.parser') | |
| text = clean_text(soup.get_text()) | |
| chunks = split_content_into_chunks(text) | |
| add_chunks_to_db(chunks) | |
| return "Scraping and processing complete. You can now ask questions!" | |
| except requests.exceptions.RequestException as e: | |
| return f"Error scraping {url}: {e}" | |
| def ask_question(query): | |
| query_embedding = embedding_model.encode(query, convert_to_list=True) | |
| results = collection.query(query_embeddings=[query_embedding], n_results=2) | |
| top_chunks = results.get("documents", [[]])[0] | |
| system_prompt = """ | |
| You are a Formula 1 expert. You answer questions about Formula 1. | |
| But you only answer based on knowledge I'm providing you. You don't use your internal | |
| knowledge and you don't make things up. | |
| If you don't know the answer, just say: I don't know. | |
| """ + str(top_chunks) | |
| full_prompt = system_prompt + "\nUser Query: " + query | |
| model = genai.GenerativeModel('gemini-2.0-flash') | |
| response = model.generate_content(full_prompt) | |
| return response.text | |
| st.title("Web Scraping & Chatbot") | |
| url = st.text_input("Enter a URL:") | |
| if url: | |
| if st.button("Scrape & Process"): | |
| result = scrape_text(url) | |
| st.success(result) | |
| if 'scraped' in st.session_state and st.session_state.scraped: | |
| st.subheader("Ask a Question") | |
| query = st.text_input("Enter your question:") | |
| if query: | |
| if st.button("Get Answer"): | |
| answer = ask_question(query) | |
| st.write(answer) | |