# PakistanBill / app.py
# Author: Engineer786 — "Update app.py" (commit b5b64fb, verified)
import os
import streamlit as st
import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
import faiss
from groq import Groq
# Fetch API key from environment variable (note: the variable is named
# 'GroqApi', not the conventional GROQ_API_KEY).
API_KEY = os.environ.get('GroqApi')

# Module-level RAG state, populated when the user processes a URL.
# (The original `global CHUNKS, INDEX, MODEL` statement was removed: `global`
# is a no-op at module scope — these assignments already create module globals.)
CHUNKS = None   # list[str] of word-window chunks, or None until processed
INDEX = None    # FAISS index over the chunk embeddings, or None
MODEL = None    # SentenceTransformer used for embedding, or None
# Function to scrape tariff data
def scrape_tariff_data(url, timeout=30):
    """Fetch *url* and return the text of all <p> elements, newline-joined.

    Args:
        url: Page to scrape.
        timeout: Seconds before the HTTP request is abandoned (the original
            call had no timeout and could hang the app indefinitely).

    Returns:
        A single string of stripped paragraph texts joined by "\n"
        (empty string if the page has no <p> elements).

    Raises:
        requests.RequestException: on network failure, timeout, or a
            non-2xx HTTP status (callers already wrap this in try/except).
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of silently scraping an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    paragraphs = [p.text.strip() for p in soup.find_all('p')]
    return "\n".join(paragraphs)
# Function to chunk text into manageable sizes
def chunk_text(text, max_length=512):
    """Split *text* into consecutive chunks of at most *max_length* words.

    Words are whatever ``str.split()`` yields; each chunk is the words
    re-joined with single spaces. Returns an empty list for empty input.
    """
    words = text.split()
    return [
        " ".join(words[start:start + max_length])
        for start in range(0, len(words), max_length)
    ]
# Function to create embeddings and FAISS index
def create_faiss_index(chunks, model_name='all-MiniLM-L6-v2'):
    """Embed *chunks* and build an exact (flat) L2 FAISS index over them.

    Returns a ``(index, embeddings, model)`` triple so callers can reuse
    the same embedding model for queries later.
    """
    embedder = SentenceTransformer(model_name)
    vectors = embedder.encode(chunks)
    # IndexFlatL2 performs brute-force L2 search; dimension comes from the
    # embedding width.
    faiss_index = faiss.IndexFlatL2(vectors.shape[1])
    faiss_index.add(vectors)
    return faiss_index, vectors, embedder
# Function to search FAISS for relevant chunks
def search_faiss(query, index, chunks, model, top_k=5):
    """Return up to *top_k* chunks most similar to *query*.

    Args:
        query: Natural-language query string.
        index: FAISS index built over the embeddings of *chunks*.
        chunks: The text chunks the index was built from, in insertion order.
        model: Embedding model exposing ``encode(list[str])``.
        top_k: Maximum number of neighbours to retrieve.

    Returns:
        The matching chunk strings, nearest first.
    """
    query_embedding = model.encode([query])
    distances, indices = index.search(query_embedding, top_k)
    # FAISS pads the result with -1 when the index holds fewer than top_k
    # vectors. The previous guard (`i < len(chunks)`) let -1 through, and
    # negative indexing silently returned the LAST chunk; require 0 <= i.
    return [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]
# Function to query the Groq API with augmented query
def query_llm(prompt, context):
    """Ask the Groq-hosted LLM *prompt*, grounded in the retrieved *context*.

    Args:
        prompt: The user's question.
        context: Retrieved chunk text to ground the answer in.

    Returns:
        The model's answer, or an error string when no API key is configured.
    """
    if not API_KEY:
        # The key is read from the 'GroqApi' env var at module load; the old
        # message named the wrong variable (GROQ_API_KEY), sending operators
        # to set an env var the app never reads.
        return "Error: 'GroqApi' is not set in environment variables."
    client = Groq(api_key=API_KEY)
    augmented_prompt = f"Based on the following data:\n\n{context}\n\nAnswer the question: {prompt}"
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": augmented_prompt,
            }
        ],
        model="llama3-8b-8192",
    )
    return chat_completion.choices[0].message.content
# Streamlit UI
st.title("RAG-Based Tariff Data Application")

# Streamlit re-executes this whole script on EVERY widget interaction, so
# plain module globals (CHUNKS/INDEX/MODEL) were reset to None before "Get
# Answer" could ever run — the query branch always reported "not processed".
# st.session_state persists across reruns and fixes that.
if "chunks" not in st.session_state:
    st.session_state.chunks = None
    st.session_state.index = None
    st.session_state.model = None

url = st.text_input("Enter Tariff Data URL", "https://iesco.com.pk/index.php/customer-services/tariff-guide")
if st.button("Process Tariff Data"):
    with st.spinner("Extracting and processing data..."):
        try:
            text = scrape_tariff_data(url)
            if not text:
                st.error("Failed to scrape data from the provided URL.")
                st.stop()
            chunks = chunk_text(text)
            if not chunks:
                st.error("No data available for processing.")
                st.stop()
            index, embeddings, model = create_faiss_index(chunks)
            if not index:
                st.error("Failed to create FAISS index.")
                st.stop()
            # Persist across reruns so the query section below can see them.
            st.session_state.chunks = chunks
            st.session_state.index = index
            st.session_state.model = model
            st.success("Data processed and indexed!")
            st.write("Number of chunks processed:", len(chunks))
        except Exception as e:
            st.error(f"Error processing data: {e}")

st.header("Query the Tariff Data")
prompt = st.text_input("Enter your query")
if st.button("Get Answer"):
    if prompt:
        with st.spinner("Fetching response..."):
            try:
                if st.session_state.index is None or not st.session_state.chunks or st.session_state.model is None:
                    st.error("Data has not been processed yet. Please process the data first.")
                else:
                    # Retrieve relevant chunks and ground the LLM answer in them.
                    relevant_chunks = search_faiss(
                        prompt,
                        st.session_state.index,
                        st.session_state.chunks,
                        st.session_state.model,
                    )
                    context = "\n".join(relevant_chunks)
                    response = query_llm(prompt, context)
                    st.write(response)
            except Exception as e:
                st.error(f"Error querying the model: {e}")
    else:
        st.warning("Please enter a query to continue.")