# RAG app: extract a PDF into a JSON dataset and answer queries via the Groq API.
import os
import json
import pdfplumber
from groq import Groq
import streamlit as st
def pdf_to_text(pdf_path):
    """Extract the full text of a PDF file using pdfplumber.

    Args:
        pdf_path: Filesystem path to the PDF file.

    Returns:
        The concatenated text of all pages. Pages with no extractable
        text (e.g. scanned/image-only pages) contribute nothing instead
        of crashing.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() returns None for pages without a text layer;
        # the original `text += page.extract_text()` raised TypeError
        # on such pages. join() also avoids quadratic += concatenation.
        return "".join(page.extract_text() or "" for page in pdf.pages)
def text_to_json(text):
    """Convert raw text into a JSON-serializable dataset structure.

    The text is split on blank lines; each paragraph becomes one entry
    carrying a 1-based section number and its content.

    Args:
        text: The raw text extracted from the PDF.

    Returns:
        A dict of the form {"dataset": [{"section": n, "content": str}, ...]}.
    """
    sections = []
    for number, paragraph in enumerate(text.split("\n\n"), start=1):
        sections.append({"section": number, "content": paragraph})
    return {"dataset": sections}
def restrict_to_pdf_query(query, dataset):
    """Return dataset sections that mention the query (case-insensitive).

    Args:
        query: The user's search string.
        dataset: A dict shaped like {"dataset": [{"content": str, ...}, ...]}.

    Returns:
        A list of matching section contents, or a single-element sentinel
        list ["No relevant content found."] when nothing matches.
    """
    needle = query.lower()
    matches = [
        section["content"]
        for section in dataset["dataset"]
        if needle in section["content"].lower()
    ]
    return matches or ["No relevant content found."]
def split_text_into_chunks(text, max_tokens=2000):
    """Split text into chunks of at most ~max_tokens whitespace-separated words.

    Newline-separated paragraphs are kept intact; a paragraph starts a new
    chunk when adding it would exceed the budget. A single paragraph larger
    than the budget becomes its own chunk.

    Args:
        text: The input text.
        max_tokens: Approximate per-chunk word budget.

    Returns:
        A list of chunk strings. (Empty input text yields [""].)
    """
    chunks = []
    current_lines = []   # paragraphs of the chunk being built
    current_words = 0    # running word count, hoisted out of the loop
                         # (the original re-split the whole chunk each
                         # iteration: accidental O(n^2))
    for paragraph in text.split("\n"):
        paragraph_words = len(paragraph.split())
        # Only flush a non-empty chunk: the original appended an empty
        # "" chunk when the very first paragraph exceeded the budget,
        # and prefixed later chunks with a stray leading "\n".
        if current_lines and current_words + paragraph_words > max_tokens:
            chunks.append("\n".join(current_lines))
            current_lines = [paragraph]
            current_words = paragraph_words
        else:
            current_lines.append(paragraph)
            current_words += paragraph_words
    if current_lines:  # flush the final chunk
        chunks.append("\n".join(current_lines))
    return chunks
# Startup: extract the PDF, build the JSON dataset, and persist it so the
# query handler below can reload it on each interaction.
pdf_path = "PAKISTAN PENAL CODE.pdf"  # Replace with the path to your PDF file
pdf_text = pdf_to_text(pdf_path)
dataset_json = text_to_json(pdf_text)

with open("dataset.json", "w") as dataset_file:
    json.dump(dataset_json, dataset_file, indent=4)

# Groq client; the API key is read from the GROQ_API_KEY environment variable.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
# Streamlit UI: take a question, retrieve matching PDF sections, and ask
# the Groq model to answer grounded in the retrieved text.
st.title("RAG App Using Groq API")
user_query = st.text_input("Ask a question:")

if user_query:
    # Reload the dataset persisted at startup.
    with open("dataset.json", "r") as dataset_file:
        dataset = json.load(dataset_file)

    # Retrieve the sections that mention the query.
    pdf_based_answer = restrict_to_pdf_query(user_query, dataset)

    if pdf_based_answer[0] == "No relevant content found.":
        st.write("No relevant content found in the PDF dataset.")
    else:
        # Merge the hits and cut them down to the model's context budget.
        relevant_text = "\n".join(pdf_based_answer)
        chunks = split_text_into_chunks(relevant_text)

        if not chunks:
            st.write("Error: Unable to process content into chunks.")
        else:
            # Only the first chunk is sent; the prompt frames the model as
            # a lawyer answering from the Pakistan Penal Code.
            prompt = f"""You are a Pakistani lawyer. Answer the following query based on the Pakistan Penal Code, explaining it in a professional and detailed manner, including references to specific sections of the code when applicable. If the information is found in the dataset, provide it accordingly. Query: "{user_query}"\nAnswer: {chunks[0]}"""

            chat_completion = client.chat.completions.create(
                messages=[{"role": "user", "content": prompt}],
                model="llama3-groq-70b-8192-tool-use-preview",
            )
            st.write(chat_completion.choices[0].message.content)