import os
import json

import pdfplumber
from groq import Groq
import streamlit as st


def pdf_to_text(pdf_path):
    """Extract the full text of a PDF using pdfplumber.

    Args:
        pdf_path: Filesystem path to the PDF file.

    Returns:
        The concatenated text of all pages. Pages with no extractable
        text are skipped (pdfplumber's ``extract_text`` returns ``None``
        for such pages; the original ``text += None`` would raise
        ``TypeError``).
    """
    with pdfplumber.open(pdf_path) as pdf:
        # "or ''" guards against extract_text() returning None on
        # image-only/empty pages; join avoids quadratic string building.
        return "".join(page.extract_text() or "" for page in pdf.pages)


def text_to_json(text):
    """Convert raw text into a structured JSON-serializable dataset.

    Splits on blank lines ("\\n\\n") so each paragraph becomes one
    numbered section.

    Args:
        text: The raw document text.

    Returns:
        A dict of the form ``{"dataset": [{"section": n, "content": ...}]}``.
    """
    paragraphs = text.split("\n\n")  # Split text into sections or paragraphs
    return {
        "dataset": [
            {"section": i + 1, "content": para}
            for i, para in enumerate(paragraphs)
        ]
    }


def restrict_to_pdf_query(query, dataset):
    """Return dataset sections whose content mentions the query.

    Matching is a simple case-insensitive substring test.

    Args:
        query: The user's question.
        dataset: The dict produced by :func:`text_to_json`.

    Returns:
        A list of matching section contents, or the sentinel list
        ``["No relevant content found."]`` when nothing matches.
        NOTE: callers compare against this exact sentinel string —
        do not change it.
    """
    query_lower = query.lower()
    relevant_content = [
        section["content"]
        for section in dataset["dataset"]
        if query_lower in section["content"].lower()
    ]
    return relevant_content if relevant_content else ["No relevant content found."]


def split_text_into_chunks(text, max_tokens=2000):
    """Split text into chunks that fit within a rough token budget.

    "Tokens" are approximated by whitespace-separated words, which is a
    coarse proxy for model tokens but errs on the small side for most
    English text.

    Args:
        text: The text to split (chunk boundaries fall on newlines).
        max_tokens: Approximate maximum word count per chunk.

    Returns:
        A list of text chunks. Fixes two defects of the earlier version:
        the first chunk no longer starts with a spurious "\\n", and an
        oversized first paragraph no longer produces an empty "" chunk.
    """
    chunks = []
    current_chunk = ""
    for paragraph in text.split("\n"):
        # Flush the current chunk only if it is non-empty and adding the
        # paragraph would exceed the budget (avoids emitting "" chunks).
        if current_chunk and (
            len(current_chunk.split()) + len(paragraph.split()) > max_tokens
        ):
            chunks.append(current_chunk)
            current_chunk = paragraph
        elif current_chunk:
            current_chunk += "\n" + paragraph
        else:
            # Start a fresh chunk without a leading newline.
            current_chunk = paragraph
    if current_chunk:  # Add the last chunk
        chunks.append(current_chunk)
    return chunks


# --- Script entry: build the dataset once, then serve the Streamlit UI ---

# Load the PDF, convert it to text, and create a JSON dataset.
pdf_path = "PAKISTAN PENAL CODE.pdf"  # Replace with the path to your PDF file
pdf_text = pdf_to_text(pdf_path)
dataset_json = text_to_json(pdf_text)

# Save the JSON dataset to a file so the query path below can reload it.
with open("dataset.json", "w", encoding="utf-8") as f:
    json.dump(dataset_json, f, indent=4)

# Set up the Groq client (reads the API key from the environment).
client = Groq(
    api_key=os.environ.get("GROQ_API_KEY"),
)

# Streamlit UI
st.title("RAG App Using Groq API")

user_query = st.text_input("Ask a question:")

if user_query:
    # Load the dataset from the JSON file.
    with open("dataset.json", "r", encoding="utf-8") as f:
        dataset = json.load(f)

    # Get the relevant content from the dataset based on the user's query.
    pdf_based_answer = restrict_to_pdf_query(user_query, dataset)

    if pdf_based_answer[0] != "No relevant content found.":
        # Combine all relevant content into one string
        # (you can limit this further if needed).
        relevant_text = "\n".join(pdf_based_answer)

        # Split the relevant content into manageable chunks.
        chunks = split_text_into_chunks(relevant_text)

        # Use only the first chunk (you can modify this to iterate over
        # chunks or dynamically choose a chunk).
        if chunks:
            # Prepare a prompt that asks the model to act as an expert lawyer.
            prompt = f"""You are a Pakistani lawyer. Answer the following query based on the Pakistan Penal Code, explaining it in a professional and detailed manner, including references to specific sections of the code when applicable. If the information is found in the dataset, provide it accordingly. Query: "{user_query}"\nAnswer: {chunks[0]}"""

            # Request answer from the model.
            chat_completion = client.chat.completions.create(
                messages=[
                    {
                        "role": "user",
                        "content": prompt,
                    }
                ],
                model="llama3-groq-70b-8192-tool-use-preview",  # Updated model
            )

            # Display the result.
            st.write(chat_completion.choices[0].message.content)
        else:
            st.write("Error: Unable to process content into chunks.")
    else:
        st.write("No relevant content found in the PDF dataset.")