Spaces:
Sleeping
Sleeping
| import os | |
| import streamlit as st | |
| import requests | |
| from bs4 import BeautifulSoup | |
| from sentence_transformers import SentenceTransformer | |
| import faiss | |
| from groq import Groq | |
# API key for the Groq service, read from the environment
# (set as a secret named 'GroqApi' on the hosting platform).
API_KEY = os.environ.get('GroqApi')

# Placeholders populated once "Process Tariff Data" has run.
# NOTE: the original module-level `global CHUNKS, INDEX, MODEL` statement
# was a no-op (names at module scope are already global) and was removed.
CHUNKS = None  # list[str] of tariff text chunks
INDEX = None   # FAISS index built over the chunk embeddings
MODEL = None   # SentenceTransformer used to encode chunks and queries
# Function to scrape tariff data
def scrape_tariff_data(url):
    """Fetch *url* and return the text of every <p> tag, newline-joined.

    Args:
        url: Page to scrape (expected to contain tariff info in <p> tags).

    Returns:
        A single string with one stripped paragraph per line; empty string
        when the page has no <p> tags.

    Raises:
        requests.HTTPError: on a non-2xx response.
        requests.Timeout: if the server does not respond within 30 seconds.
    """
    # timeout prevents the app from hanging forever on an unresponsive host
    response = requests.get(url, timeout=30)
    # Surface 4xx/5xx explicitly instead of silently parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    paragraphs = [p.text.strip() for p in soup.find_all('p')]
    return "\n".join(paragraphs)
# Function to chunk text into manageable sizes
def chunk_text(text, max_length=512):
    """Split *text* into chunks of at most *max_length* whitespace words.

    Args:
        text: Raw text to split (any whitespace separates words).
        max_length: Maximum number of words per chunk; must be positive.

    Returns:
        List of chunk strings; empty list for empty/whitespace-only text.
        The last chunk may hold fewer than max_length words.
    """
    words = text.split()
    # Comprehension replaces the original append loop; same output.
    return [
        " ".join(words[i:i + max_length])
        for i in range(0, len(words), max_length)
    ]
# Function to create embeddings and FAISS index
def create_faiss_index(chunks, model_name='all-MiniLM-L6-v2'):
    """Embed *chunks* with a SentenceTransformer and index them in FAISS.

    Args:
        chunks: List of text chunks to embed.
        model_name: SentenceTransformer model identifier.

    Returns:
        (index, embeddings, model) — the FAISS index, the raw embedding
        matrix, and the encoder (kept so queries can be embedded later).
    """
    encoder = SentenceTransformer(model_name)
    vectors = encoder.encode(chunks)
    # Exact (brute-force) L2 search over the embedding dimensionality.
    l2_index = faiss.IndexFlatL2(vectors.shape[1])
    l2_index.add(vectors)
    return l2_index, vectors, encoder
# Function to search FAISS for relevant chunks
def search_faiss(query, index, chunks, model, top_k=5):
    """Return up to *top_k* chunks most similar to *query*, best first.

    Args:
        query: Free-text query string.
        index: FAISS index built over the chunk embeddings.
        chunks: Text chunks aligned row-for-row with the index.
        model: Encoder exposing .encode(list[str]) (SentenceTransformer).
        top_k: Maximum number of chunks to retrieve.

    Returns:
        List of chunk strings (possibly fewer than top_k).
    """
    query_embedding = model.encode([query])
    distances, indices = index.search(query_embedding, top_k)
    # FAISS pads the result with -1 when the index holds fewer than top_k
    # vectors; the original `i < len(chunks)` test let -1 through and
    # silently returned chunks[-1]. Require 0 <= i as well.
    return [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]
# Function to query the Groq API with augmented query
def query_llm(prompt, context):
    """Ask the Groq-hosted LLM *prompt*, grounded in *context*.

    Args:
        prompt: The user's question.
        context: Retrieved tariff chunks used to ground the answer.

    Returns:
        The model's answer string, or an error string when the API key
        is missing.
    """
    if not API_KEY:
        # Name the env var this app actually reads ('GroqApi'); the
        # original message referred to a GROQ_API_KEY variable that the
        # code never consults.
        return "Error: GroqApi is not set in environment variables."
    client = Groq(api_key=API_KEY)
    augmented_prompt = f"Based on the following data:\n\n{context}\n\nAnswer the question: {prompt}"
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": augmented_prompt,
            }
        ],
        model="llama3-8b-8192",
    )
    return chat_completion.choices[0].message.content
# --- Streamlit UI ---
# Streamlit reruns this entire script on every widget interaction, so any
# module-level variables assigned while handling one button click are reset
# before the next click is handled. The original code stored CHUNKS/INDEX/
# MODEL in module globals, which meant "Get Answer" always saw None and
# reported "Data has not been processed yet." State that must survive
# reruns lives in st.session_state instead.
st.title("RAG-Based Tariff Data Application")

url = st.text_input("Enter Tariff Data URL", "https://iesco.com.pk/index.php/customer-services/tariff-guide")

if st.button("Process Tariff Data"):
    with st.spinner("Extracting and processing data..."):
        try:
            text = scrape_tariff_data(url)
            if not text:
                st.error("Failed to scrape data from the provided URL.")
                st.stop()
            chunks = chunk_text(text)
            if not chunks:
                st.error("No data available for processing.")
                st.stop()
            index, embeddings, model = create_faiss_index(chunks)
            if not index:
                st.error("Failed to create FAISS index.")
                st.stop()
            # Persist across reruns so the query section can see the data.
            st.session_state["chunks"] = chunks
            st.session_state["index"] = index
            st.session_state["model"] = model
            st.success("Data processed and indexed!")
            st.write("Number of chunks processed:", len(chunks))
        except Exception as e:
            st.error(f"Error processing data: {e}")

st.header("Query the Tariff Data")
prompt = st.text_input("Enter your query")
if st.button("Get Answer"):
    if prompt:
        with st.spinner("Fetching response..."):
            try:
                index = st.session_state.get("index")
                chunks = st.session_state.get("chunks")
                model = st.session_state.get("model")
                if not (index and chunks and model):
                    st.error("Data has not been processed yet. Please process the data first.")
                else:
                    # Retrieve relevant chunks
                    relevant_chunks = search_faiss(prompt, index, chunks, model)
                    context = "\n".join(relevant_chunks)
                    # Query the LLM with context
                    response = query_llm(prompt, context)
                    st.write(response)
            except Exception as e:
                st.error(f"Error querying the model: {e}")
    else:
        st.warning("Please enter a query to continue.")