# RAG-App / app.py
# kinely — Update app.py (commit da5bb34, verified)
import os
import json
import pdfplumber
from groq import Groq
import streamlit as st
def pdf_to_text(pdf_path):
    """Extract the full text content of a PDF file.

    Parameters
    ----------
    pdf_path : str
        Path to the PDF file to read.

    Returns
    -------
    str
        Text of every page concatenated in page order. Pages with no
        extractable text layer contribute an empty string.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # BUG FIX: extract_text() returns None for pages without a text
        # layer; the original `text += page.extract_text()` raised
        # TypeError on such pages. `or ""` makes them contribute nothing.
        return "".join(page.extract_text() or "" for page in pdf.pages)
def text_to_json(text):
    """Convert raw text into the app's JSON dataset structure.

    The text is split on blank lines ("\n\n"); each resulting paragraph
    becomes one entry with a 1-based section number.

    Parameters
    ----------
    text : str
        Raw text, e.g. the output of pdf_to_text().

    Returns
    -------
    dict
        {"dataset": [{"section": 1, "content": "..."}, ...]}
    """
    sections = []
    for number, paragraph in enumerate(text.split("\n\n"), start=1):
        sections.append({"section": number, "content": paragraph})
    return {"dataset": sections}
def restrict_to_pdf_query(query, dataset):
    """Return dataset sections whose content mentions the query.

    Matching is a case-insensitive substring test against each section's
    "content" field.

    Parameters
    ----------
    query : str
        The user's search text.
    dataset : dict
        Dataset in the {"dataset": [{"content": ...}, ...]} shape
        produced by text_to_json().

    Returns
    -------
    list[str]
        The matching sections' original content, or the single sentinel
        entry ["No relevant content found."] when nothing matches.
    """
    needle = query.lower()
    matches = [
        entry["content"]
        for entry in dataset["dataset"]
        if needle in entry["content"].lower()
    ]
    return matches or ["No relevant content found."]
# Function to split text into manageable chunks
def split_text_into_chunks(text, max_tokens=2000):
# Split text into chunks that fit within the model's token limit
chunks = []
current_chunk = ""
for paragraph in text.split("\n"):
# Check token length before adding paragraph
if len(current_chunk.split()) + len(paragraph.split()) > max_tokens:
chunks.append(current_chunk)
current_chunk = paragraph
else:
current_chunk += "\n" + paragraph
if current_chunk: # Add the last chunk
chunks.append(current_chunk)
return chunks
# Startup: build the JSON dataset from the bundled PDF.
pdf_path = "PAKISTAN PENAL CODE.pdf"  # Replace with the path to your PDF file
dataset_json = text_to_json(pdf_to_text(pdf_path))

# Persist the dataset so the query handler below can reload it.
with open("dataset.json", "w") as f:
    json.dump(dataset_json, f, indent=4)

# Groq API client; the key is read from the GROQ_API_KEY environment variable.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
# Streamlit UI: take a question, retrieve matching PDF sections, and ask
# the Groq model to answer from them.
st.title("RAG App Using Groq API")

user_query = st.text_input("Ask a question:")
if user_query:
    # Reload the dataset written at startup.
    with open("dataset.json", "r") as f:
        dataset = json.load(f)

    # Retrieve the sections that mention the query verbatim.
    pdf_based_answer = restrict_to_pdf_query(user_query, dataset)
    if pdf_based_answer[0] != "No relevant content found.":
        # Combine all relevant content, then split it to fit the model's
        # context window; only the first chunk is sent.
        relevant_text = "\n".join(pdf_based_answer)
        chunks = split_text_into_chunks(relevant_text)
        if chunks:
            # BUG FIX: the original appended the retrieved excerpt after
            # "Answer:", telling the model the excerpt already WAS the
            # answer. Present it as context and let the model answer.
            prompt = (
                "You are a Pakistani lawyer. Answer the following query based on "
                "the Pakistan Penal Code, explaining it in a professional and "
                "detailed manner, including references to specific sections of "
                "the code when applicable. Base your answer only on the context "
                "below.\n\n"
                f"Context:\n{chunks[0]}\n\n"
                f'Query: "{user_query}"\nAnswer:'
            )
            # Request the answer from the model.
            chat_completion = client.chat.completions.create(
                messages=[
                    {
                        "role": "user",
                        "content": prompt,
                    }
                ],
                # NOTE(review): preview model id — confirm it is still served.
                model="llama3-groq-70b-8192-tool-use-preview",
            )
            # Display the result.
            st.write(chat_completion.choices[0].message.content)
        else:
            st.write("Error: Unable to process content into chunks.")
    else:
        st.write("No relevant content found in the PDF dataset.")