|
|
import os |
|
|
import json |
|
|
import pdfplumber |
|
|
from groq import Groq |
|
|
import streamlit as st |
|
|
|
|
|
|
|
|
def pdf_to_text(pdf_path):
    """Extract the text of every page of a PDF into one string.

    Args:
        pdf_path: Path to the PDF file to read.

    Returns:
        The concatenated text of all pages (no separator between pages,
        matching the original behavior). Pages with no extractable text
        layer are skipped.
    """
    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # pdfplumber's extract_text() returns None for pages with no
            # text layer (e.g. scanned images); the original `text += ...`
            # raised TypeError on such pages.
            page_text = page.extract_text()
            if page_text:
                page_texts.append(page_text)
    # join once instead of repeated string += (quadratic on large PDFs)
    return "".join(page_texts)
|
|
|
|
|
|
|
|
def text_to_json(text):
    """Split *text* on blank lines and wrap it as a sectioned dataset.

    Args:
        text: Raw document text; paragraphs are delimited by "\n\n".

    Returns:
        A dict of the form ``{"dataset": [{"section": 1, "content": ...}, ...]}``
        with sections numbered from 1 in document order.
    """
    sections = []
    for number, paragraph in enumerate(text.split("\n\n"), start=1):
        sections.append({"section": number, "content": paragraph})
    return {"dataset": sections}
|
|
|
|
|
|
|
|
def restrict_to_pdf_query(query, dataset):
    """Return dataset sections whose content contains *query* (case-insensitive).

    Args:
        query: The user's search string.
        dataset: Dict with a "dataset" list of {"section", "content"} entries.

    Returns:
        The original-case content of every matching section, in dataset
        order; if nothing matches, the single-element sentinel list
        ``["No relevant content found."]``.
    """
    needle = query.lower()
    matches = [
        entry["content"]
        for entry in dataset["dataset"]
        if needle in entry["content"].lower()
    ]
    return matches or ["No relevant content found."]
|
|
|
|
|
|
|
|
def split_text_into_chunks(text, max_tokens=2000):
    """Greedily pack newline-separated lines into chunks of at most
    ~*max_tokens* whitespace-separated tokens each.

    Args:
        text: The text to split; lines are delimited by "\n".
        max_tokens: Approximate per-chunk token budget (word count).

    Returns:
        A list of chunk strings, each rejoining its lines with "\n".
        A single line longer than *max_tokens* becomes its own chunk.
        Empty input yields an empty list.

    Fixes over the original:
      * the first line of each chunk no longer gets a spurious leading
        "\n" (the original did ``current_chunk += "\n" + paragraph`` even
        when ``current_chunk`` was empty);
      * an empty string is no longer emitted as a chunk when the very
        first line alone exceeds the budget.
    """
    chunks = []
    current_chunk = ""

    for line in text.split("\n"):
        if not current_chunk:
            # start a fresh chunk without a leading newline
            current_chunk = line
        elif len(current_chunk.split()) + len(line.split()) > max_tokens:
            # budget exceeded: flush the current chunk, start a new one
            chunks.append(current_chunk)
            current_chunk = line
        else:
            current_chunk += "\n" + line

    # flush the trailing partial chunk, if any
    if current_chunk:
        chunks.append(current_chunk)

    return chunks
|
|
|
|
|
|
|
|
# --- One-time setup (runs at import time) ---

# Source document for the dataset. NOTE(review): the script raises at
# startup if this file is absent from the working directory.
pdf_path = "PAKISTAN PENAL CODE.pdf"

# Extract the full document text, then break it into numbered sections.
pdf_text = pdf_to_text(pdf_path)

dataset_json = text_to_json(pdf_text)

# Persist the sectioned dataset; the query handler below reloads it from
# disk on each Streamlit rerun.
with open("dataset.json", "w") as f:
    json.dump(dataset_json, f, indent=4)

# Groq API client, keyed from the environment.
# NOTE(review): os.environ.get returns None when GROQ_API_KEY is unset,
# so a missing key surfaces only on the first API call — confirm that is
# the intended failure mode.
client = Groq(
    api_key=os.environ.get("GROQ_API_KEY"),
)
|
|
|
|
|
|
|
|
# --- Streamlit UI and query handling ---

st.title("RAG App Using Groq API")

user_query = st.text_input("Ask a question:")

# This whole script reruns on every interaction; the branch below only
# executes once the user has typed a non-empty query.
if user_query:

    # Reload the dataset written during setup so each rerun is stateless.
    with open("dataset.json", "r") as f:
        dataset = json.load(f)

    # Naive retrieval: sections containing the query as a substring.
    pdf_based_answer = restrict_to_pdf_query(user_query, dataset)

    # restrict_to_pdf_query signals "no match" via this sentinel string.
    # NOTE(review): fragile — a PDF section whose text literally equals
    # the sentinel would be misread as a miss; an empty-list return from
    # the retriever would be a sturdier contract.
    if pdf_based_answer[0] != "No relevant content found.":

        relevant_text = "\n".join(pdf_based_answer)

        # Split to stay within the model's context budget.
        chunks = split_text_into_chunks(relevant_text)

        if chunks:

            # NOTE(review): only chunks[0] is sent to the model; any
            # remaining relevant text is silently dropped. Also, the
            # retrieved context is labeled "Answer:" in the prompt —
            # presumably intended as grounding text; verify the wording.
            prompt = f"""You are a Pakistani lawyer. Answer the following query based on the Pakistan Penal Code, explaining it in a professional and detailed manner, including references to specific sections of the code when applicable. If the information is found in the dataset, provide it accordingly. Query: "{user_query}"\nAnswer: {chunks[0]}"""

            # Single-turn completion against the Groq-hosted Llama model.
            chat_completion = client.chat.completions.create(
                messages=[
                    {
                        "role": "user",
                        "content": prompt,
                    }
                ],
                model="llama3-groq-70b-8192-tool-use-preview",
            )

            # Render the model's reply.
            st.write(chat_completion.choices[0].message.content)

        else:
            # Unreachable in practice when relevant_text is non-empty,
            # kept as a defensive fallback.
            st.write("Error: Unable to process content into chunks.")

    else:
        st.write("No relevant content found in the PDF dataset.")
|
|
|