File size: 4,051 Bytes
33866cf
 
 
 
 
 
 
 
b55d380
372064c
b5b64fb
2a6e98a
c97a441
5230b46
b4b1755
5230b46
b55d380
4a65690
33866cf
 
 
 
 
 
 
 
4a65690
33866cf
 
 
 
 
 
 
4a65690
33866cf
 
 
 
 
 
 
 
5230b46
 
 
 
 
 
 
 
 
b55d380
4a65690
 
b55d380
5230b46
33866cf
 
 
 
5230b46
33866cf
 
 
 
 
 
 
 
 
 
 
 
 
4a65690
b5b64fb
4a65690
b5b64fb
 
 
 
5230b46
b5b64fb
 
 
 
5230b46
b5b64fb
 
 
 
4a65690
b5b64fb
 
4a65690
 
 
 
 
33866cf
4a65690
 
33866cf
4a65690
5230b46
 
 
 
 
 
 
 
 
 
4a65690
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import os
import streamlit as st
import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
import faiss
from groq import Groq

# Fetch the Groq API key once at import time.
# NOTE(review): the environment variable here is 'GroqApi', but the error
# message in query_llm historically referred to GROQ_API_KEY — confirm
# which name the deployment actually sets.
API_KEY = os.environ.get('GroqApi')

# Module-level state populated by the "Process Tariff Data" button.
# (The former `global CHUNKS, INDEX, MODEL` statement was a no-op at
# module scope — module scope *is* the global scope — so it is removed.)
CHUNKS = None  # list[str] of text chunks, or None until data is processed
INDEX = None   # FAISS index over the chunk embeddings, or None
MODEL = None   # SentenceTransformer used to embed chunks/queries, or None

# Function to scrape tariff data
def scrape_tariff_data(url):
    """Fetch *url* and return the text of all <p> elements, newline-joined.

    Raises:
        requests.HTTPError: on a non-2xx response (previously the error
            page itself was silently parsed as tariff data).
        requests.Timeout: if the server does not answer within 10 seconds
            (previously the request could hang the UI indefinitely).
    """
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    # Collect every paragraph's stripped text; join into one document string.
    return "\n".join(p.text.strip() for p in soup.find_all('p'))

# Function to chunk text into manageable sizes
def chunk_text(text, max_length=512):
    """Split *text* into word-based chunks of at most *max_length* words.

    Returns a list of strings; an empty/whitespace-only input yields [].
    """
    words = text.split()
    return [
        " ".join(words[start:start + max_length])
        for start in range(0, len(words), max_length)
    ]

# Function to create embeddings and FAISS index
def create_faiss_index(chunks, model_name='all-MiniLM-L6-v2'):
    """Embed *chunks* with a SentenceTransformer and build a flat L2 index.

    Returns a (index, embeddings, model) tuple so callers can reuse the
    same model for query-time encoding.
    """
    embedder = SentenceTransformer(model_name)
    vectors = embedder.encode(chunks)
    # IndexFlatL2 performs exact L2 search over the full embedding matrix.
    idx = faiss.IndexFlatL2(vectors.shape[1])
    idx.add(vectors)
    return idx, vectors, embedder

# Function to search FAISS for relevant chunks
def search_faiss(query, index, chunks, model, top_k=5):
    """Return up to *top_k* chunks most similar to *query*.

    FAISS pads `indices` with -1 when the index holds fewer than top_k
    vectors. The previous filter (`i < len(chunks)`) let -1 through,
    which silently returned the *last* chunk via negative indexing;
    requiring `0 <= i` drops those padding slots instead.
    """
    query_embedding = model.encode([query])
    distances, indices = index.search(query_embedding, top_k)
    return [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]

# Function to query the Groq API with augmented query
def query_llm(prompt, context):
    """Send *prompt* augmented with *context* to the Groq chat API.

    Returns the model's reply text, or an error string when no API key
    is configured (the key is read from the 'GroqApi' environment
    variable at module load — the old message wrongly named
    GROQ_API_KEY, sending users to set the wrong variable).
    """
    if not API_KEY:
        return "Error: the 'GroqApi' environment variable is not set."

    client = Groq(api_key=API_KEY)
    # Prepend the retrieved chunks so the model answers from the scraped data.
    augmented_prompt = f"Based on the following data:\n\n{context}\n\nAnswer the question: {prompt}"
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": augmented_prompt,
            }
        ],
        model="llama3-8b-8192",
    )
    return chat_completion.choices[0].message.content

# Streamlit UI
st.title("RAG-Based Tariff Data Application")

url = st.text_input("Enter Tariff Data URL", "https://iesco.com.pk/index.php/customer-services/tariff-guide")

if st.button("Process Tariff Data"):
    with st.spinner("Extracting and processing data..."):
        try:
            text = scrape_tariff_data(url)
            if not text:
                st.error("Failed to scrape data from the provided URL.")
                st.stop()

            chunks = chunk_text(text)
            if not chunks:
                st.error("No data available for processing.")
                st.stop()

            index, embeddings, model = create_faiss_index(chunks)
            if not index:
                st.error("Failed to create FAISS index.")
                st.stop()

            # Streamlit reruns this whole script on every widget interaction,
            # resetting module globals — which made the query button always
            # report "not processed". session_state survives reruns.
            st.session_state["chunks"] = chunks
            st.session_state["index"] = index
            st.session_state["model"] = model

            st.success("Data processed and indexed!")
            st.write("Number of chunks processed:", len(chunks))

        except Exception as e:
            st.error(f"Error processing data: {e}")

st.header("Query the Tariff Data")
prompt = st.text_input("Enter your query")

if st.button("Get Answer"):
    if prompt:
        with st.spinner("Fetching response..."):
            try:
                # Pull the processed artifacts persisted by the button above.
                index = st.session_state.get("index")
                chunks = st.session_state.get("chunks")
                model = st.session_state.get("model")
                if not (index and chunks and model):
                    st.error("Data has not been processed yet. Please process the data first.")
                else:
                    # Retrieve relevant chunks
                    relevant_chunks = search_faiss(prompt, index, chunks, model)
                    context = "\n".join(relevant_chunks)

                    # Query the LLM with context
                    response = query_llm(prompt, context)
                    st.write(response)
            except Exception as e:
                st.error(f"Error querying the model: {e}")
    else:
        st.warning("Please enter a query to continue.")