import os

import faiss
import requests
import streamlit as st
from bs4 import BeautifulSoup
from groq import Groq
from sentence_transformers import SentenceTransformer

# Groq API key is read from the environment variable `GroqApi`
# (set it before launching the app).
API_KEY = os.environ.get('GroqApi')


def scrape_tariff_data(url):
    """Fetch *url* and return the text of all <p> elements, newline-joined.

    Returns an empty string when the page contains no paragraphs.
    Raises requests.RequestException on network failures or non-2xx
    responses (surfaced to the user by the caller's error handler).
    """
    # Timeout so a dead host cannot hang the Streamlit UI indefinitely;
    # raise_for_status so an HTTP error page is not parsed as tariff data.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    return "\n".join(p.text.strip() for p in soup.find_all('p'))


def chunk_text(text, max_length=512):
    """Split *text* into chunks of at most *max_length* whitespace-separated words."""
    words = text.split()
    return [
        " ".join(words[i:i + max_length])
        for i in range(0, len(words), max_length)
    ]


def create_faiss_index(chunks, model_name='all-MiniLM-L6-v2'):
    """Embed *chunks* with a SentenceTransformer and build an L2 FAISS index.

    Returns a tuple ``(index, embeddings, model)`` so the caller can reuse
    the same model for query-time encoding.
    """
    model = SentenceTransformer(model_name)
    embeddings = model.encode(chunks)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, embeddings, model


def search_faiss(query, index, chunks, model, top_k=5):
    """Return up to *top_k* chunks most similar to *query* (L2 distance)."""
    query_embedding = model.encode([query])
    _, indices = index.search(query_embedding, top_k)
    # FAISS pads missing neighbours with -1; the original `i < len(chunks)`
    # check let -1 through and silently returned the LAST chunk via negative
    # indexing. Guard both ends of the range.
    return [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]


def query_llm(prompt, context):
    """Send *prompt*, augmented with retrieved *context*, to the Groq LLM.

    Returns the model's answer text, or an error string when the API key
    is missing.
    """
    if not API_KEY:
        return "Error: GROQ_API_KEY is not set in environment variables."
    client = Groq(api_key=API_KEY)
    augmented_prompt = f"Based on the following data:\n\n{context}\n\nAnswer the question: {prompt}"
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": augmented_prompt,
            }
        ],
        model="llama3-8b-8192",
    )
    return chat_completion.choices[0].message.content


# ---------------------------------------------------------------------------
# Streamlit UI
#
# Streamlit re-executes this entire script on every widget interaction, so
# plain module-level globals are reset between button clicks — the original
# CHUNKS/INDEX/MODEL globals were always None by the time "Get Answer" ran.
# st.session_state is the supported mechanism for persisting values across
# reruns.
# ---------------------------------------------------------------------------
st.title("RAG-Based Tariff Data Application")

for _key in ("chunks", "index", "model"):
    st.session_state.setdefault(_key, None)

url = st.text_input("Enter Tariff Data URL", "https://iesco.com.pk/index.php/customer-services/tariff-guide")

if st.button("Process Tariff Data"):
    with st.spinner("Extracting and processing data..."):
        try:
            text = scrape_tariff_data(url)
            if not text:
                st.error("Failed to scrape data from the provided URL.")
                st.stop()
            chunks = chunk_text(text)
            if not chunks:
                st.error("No data available for processing.")
                st.stop()
            index, embeddings, model = create_faiss_index(chunks)
            if not index:
                st.error("Failed to create FAISS index.")
                st.stop()
            # Persist across reruns so the query branch can see them later.
            st.session_state.chunks = chunks
            st.session_state.index = index
            st.session_state.model = model
            st.success("Data processed and indexed!")
            st.write("Number of chunks processed:", len(chunks))
        except Exception as e:
            st.error(f"Error processing data: {e}")

st.header("Query the Tariff Data")
prompt = st.text_input("Enter your query")

if st.button("Get Answer"):
    if prompt:
        with st.spinner("Fetching response..."):
            try:
                if not (st.session_state.index and st.session_state.chunks and st.session_state.model):
                    st.error("Data has not been processed yet. Please process the data first.")
                else:
                    # Retrieve the chunks most relevant to the user's query.
                    relevant_chunks = search_faiss(
                        prompt,
                        st.session_state.index,
                        st.session_state.chunks,
                        st.session_state.model,
                    )
                    context = "\n".join(relevant_chunks)
                    # Query the LLM with the retrieved context.
                    response = query_llm(prompt, context)
                    st.write(response)
            except Exception as e:
                st.error(f"Error querying the model: {e}")
    else:
        st.warning("Please enter a query to continue.")