import os
import json

import pdfplumber
from groq import Groq
import streamlit as st


def pdf_to_text(pdf_path):
    """Extract the full text of a PDF using pdfplumber.

    Args:
        pdf_path: Filesystem path to the PDF file.

    Returns:
        The concatenated text of all pages. Pages with no extractable
        text are skipped (pdfplumber's ``extract_text`` returns ``None``
        for such pages; the original ``text += None`` would raise
        ``TypeError``).
    """
    with pdfplumber.open(pdf_path) as pdf:
        # "or ''" guards against extract_text() returning None on
        # image-only/empty pages; join avoids quadratic string building.
        return "".join(page.extract_text() or "" for page in pdf.pages)


def text_to_json(text):
    """Convert raw text into a structured JSON-serializable dataset.

    Splits on blank lines ("\\n\\n") so each paragraph becomes one
    numbered section.

    Args:
        text: The raw document text.

    Returns:
        A dict of the form ``{"dataset": [{"section": n, "content": ...}]}``.
    """
    paragraphs = text.split("\n\n")  # Split text into sections or paragraphs
    return {
        "dataset": [
            {"section": i + 1, "content": para}
            for i, para in enumerate(paragraphs)
        ]
    }


def restrict_to_pdf_query(query, dataset):
    """Return dataset sections whose content mentions the query.

    Matching is a simple case-insensitive substring test.

    Args:
        query: The user's question.
        dataset: The dict produced by :func:`text_to_json`.

    Returns:
        A list of matching section contents, or the sentinel list
        ``["No relevant content found."]`` when nothing matches.
        NOTE: callers compare against this exact sentinel string —
        do not change it.
    """
    query_lower = query.lower()
    relevant_content = [
        section["content"]
        for section in dataset["dataset"]
        if query_lower in section["content"].lower()
    ]
    return relevant_content if relevant_content else ["No relevant content found."]


def split_text_into_chunks(text, max_tokens=2000):
    """Split text into chunks that fit within a rough token budget.

    "Tokens" are approximated by whitespace-separated words, which is a
    coarse proxy for model tokens but errs on the small side for most
    English text.

    Args:
        text: The text to split (chunk boundaries fall on newlines).
        max_tokens: Approximate maximum word count per chunk.

    Returns:
        A list of text chunks. Fixes two defects of the earlier version:
        the first chunk no longer starts with a spurious "\\n", and an
        oversized first paragraph no longer produces an empty "" chunk.
    """
    chunks = []
    current_chunk = ""
    for paragraph in text.split("\n"):
        # Flush the current chunk only if it is non-empty and adding the
        # paragraph would exceed the budget (avoids emitting "" chunks).
        if current_chunk and (
            len(current_chunk.split()) + len(paragraph.split()) > max_tokens
        ):
            chunks.append(current_chunk)
            current_chunk = paragraph
        elif current_chunk:
            current_chunk += "\n" + paragraph
        else:
            # Start a fresh chunk without a leading newline.
            current_chunk = paragraph
    if current_chunk:  # Add the last chunk
        chunks.append(current_chunk)
    return chunks


# --- Script entry: build the dataset once, then serve the Streamlit UI ---

# Load the PDF, convert it to text, and create a JSON dataset.
pdf_path = "PAKISTAN PENAL CODE.pdf"  # Replace with the path to your PDF file
pdf_text = pdf_to_text(pdf_path)
dataset_json = text_to_json(pdf_text)

# Save the JSON dataset to a file so the query path below can reload it.
with open("dataset.json", "w", encoding="utf-8") as f:
    json.dump(dataset_json, f, indent=4)

# Set up the Groq client (reads the API key from the environment).
client = Groq(
    api_key=os.environ.get("GROQ_API_KEY"),
)

# Streamlit UI
st.title("RAG App Using Groq API")

user_query = st.text_input("Ask a question:")

if user_query:
    # Load the dataset from the JSON file.
    with open("dataset.json", "r", encoding="utf-8") as f:
        dataset = json.load(f)

    # Get the relevant content from the dataset based on the user's query.
    pdf_based_answer = restrict_to_pdf_query(user_query, dataset)

    if pdf_based_answer[0] != "No relevant content found.":
        # Combine all relevant content into one string
        # (you can limit this further if needed).
        relevant_text = "\n".join(pdf_based_answer)

        # Split the relevant content into manageable chunks.
        chunks = split_text_into_chunks(relevant_text)

        # Use only the first chunk (you can modify this to iterate over
        # chunks or dynamically choose a chunk).
        if chunks:
            # Prepare a prompt that asks the model to act as an expert lawyer.
            prompt = f"""You are a Pakistani lawyer. Answer the following query based on the Pakistan Penal Code, explaining it in a professional and detailed manner, including references to specific sections of the code when applicable. If the information is found in the dataset, provide it accordingly. Query: "{user_query}"\nAnswer: {chunks[0]}"""

            # Request answer from the model.
            chat_completion = client.chat.completions.create(
                messages=[
                    {
                        "role": "user",
                        "content": prompt,
                    }
                ],
                model="llama3-groq-70b-8192-tool-use-preview",  # Updated model
            )

            # Display the result.
            st.write(chat_completion.choices[0].message.content)
        else:
            st.write("Error: Unable to process content into chunks.")
    else:
        st.write("No relevant content found in the PDF dataset.")