# RAG app: extract a PDF into a JSON dataset and answer queries via the Groq API.
import os
import json
import pdfplumber
from groq import Groq
import streamlit as st
def pdf_to_text(pdf_path):
    """Extract the full text of a PDF file using pdfplumber.

    Args:
        pdf_path: Filesystem path to the PDF file.

    Returns:
        The concatenated text of all pages. Pages with no extractable
        text (e.g. scanned/image-only pages) contribute nothing instead
        of crashing.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() returns None for pages without a text layer;
        # the original `text += page.extract_text()` raised TypeError
        # on such pages. join() also avoids quadratic += concatenation.
        return "".join(page.extract_text() or "" for page in pdf.pages)
def text_to_json(text):
    """Convert raw text into a JSON-serializable dataset structure.

    The text is split on blank lines; each paragraph becomes one entry
    carrying a 1-based section number and its content.

    Args:
        text: The raw text extracted from the PDF.

    Returns:
        A dict of the form {"dataset": [{"section": n, "content": str}, ...]}.
    """
    sections = []
    for number, paragraph in enumerate(text.split("\n\n"), start=1):
        sections.append({"section": number, "content": paragraph})
    return {"dataset": sections}
def restrict_to_pdf_query(query, dataset):
    """Return dataset sections that mention the query (case-insensitive).

    Args:
        query: The user's search string.
        dataset: A dict shaped like {"dataset": [{"content": str, ...}, ...]}.

    Returns:
        A list of matching section contents, or a single-element sentinel
        list ["No relevant content found."] when nothing matches.
    """
    needle = query.lower()
    matches = [
        section["content"]
        for section in dataset["dataset"]
        if needle in section["content"].lower()
    ]
    return matches or ["No relevant content found."]
def split_text_into_chunks(text, max_tokens=2000):
    """Split text into chunks of at most ~max_tokens whitespace-separated words.

    Newline-separated paragraphs are kept intact; a paragraph starts a new
    chunk when adding it would exceed the budget. A single paragraph larger
    than the budget becomes its own chunk.

    Args:
        text: The input text.
        max_tokens: Approximate per-chunk word budget.

    Returns:
        A list of chunk strings. (Empty input text yields [""].)
    """
    chunks = []
    current_lines = []   # paragraphs of the chunk being built
    current_words = 0    # running word count, hoisted out of the loop
                         # (the original re-split the whole chunk each
                         # iteration: accidental O(n^2))
    for paragraph in text.split("\n"):
        paragraph_words = len(paragraph.split())
        # Only flush a non-empty chunk: the original appended an empty
        # "" chunk when the very first paragraph exceeded the budget,
        # and prefixed later chunks with a stray leading "\n".
        if current_lines and current_words + paragraph_words > max_tokens:
            chunks.append("\n".join(current_lines))
            current_lines = [paragraph]
            current_words = paragraph_words
        else:
            current_lines.append(paragraph)
            current_words += paragraph_words
    if current_lines:  # flush the final chunk
        chunks.append("\n".join(current_lines))
    return chunks
# Startup: extract the PDF, build the JSON dataset, and persist it so the
# query handler below can reload it on each interaction.
pdf_path = "PAKISTAN PENAL CODE.pdf"  # Replace with the path to your PDF file
pdf_text = pdf_to_text(pdf_path)
dataset_json = text_to_json(pdf_text)

with open("dataset.json", "w") as dataset_file:
    json.dump(dataset_json, dataset_file, indent=4)

# Groq client; the API key is read from the GROQ_API_KEY environment variable.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
# Streamlit UI: take a question, retrieve matching PDF sections, and ask
# the Groq model to answer grounded in the retrieved text.
st.title("RAG App Using Groq API")
user_query = st.text_input("Ask a question:")

if user_query:
    # Reload the dataset persisted at startup.
    with open("dataset.json", "r") as dataset_file:
        dataset = json.load(dataset_file)

    # Retrieve the sections that mention the query.
    pdf_based_answer = restrict_to_pdf_query(user_query, dataset)

    if pdf_based_answer[0] == "No relevant content found.":
        st.write("No relevant content found in the PDF dataset.")
    else:
        # Merge the hits and cut them down to the model's context budget.
        relevant_text = "\n".join(pdf_based_answer)
        chunks = split_text_into_chunks(relevant_text)

        if not chunks:
            st.write("Error: Unable to process content into chunks.")
        else:
            # Only the first chunk is sent; the prompt frames the model as
            # a lawyer answering from the Pakistan Penal Code.
            prompt = f"""You are a Pakistani lawyer. Answer the following query based on the Pakistan Penal Code, explaining it in a professional and detailed manner, including references to specific sections of the code when applicable. If the information is found in the dataset, provide it accordingly. Query: "{user_query}"\nAnswer: {chunks[0]}"""

            chat_completion = client.chat.completions.create(
                messages=[{"role": "user", "content": prompt}],
                model="llama3-groq-70b-8192-tool-use-preview",
            )
            st.write(chat_completion.choices[0].message.content)