# RAG-App / app.py
# kinely — Update app.py (commit da5bb34, verified)
import os
import json
import pdfplumber
from groq import Groq
import streamlit as st
def pdf_to_text(pdf_path):
    """Extract the full text content of a PDF file.

    Parameters
    ----------
    pdf_path : str
        Path to the PDF file to read.

    Returns
    -------
    str
        Text of every page concatenated in page order. Pages with no
        extractable text layer contribute an empty string.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # BUG FIX: extract_text() returns None for pages without a text
        # layer; the original `text += page.extract_text()` raised
        # TypeError on such pages. `or ""` makes them contribute nothing.
        return "".join(page.extract_text() or "" for page in pdf.pages)
def text_to_json(text):
    """Convert raw text into the app's JSON dataset structure.

    The text is split on blank lines ("\n\n"); each resulting paragraph
    becomes one entry with a 1-based section number.

    Parameters
    ----------
    text : str
        Raw text, e.g. the output of pdf_to_text().

    Returns
    -------
    dict
        {"dataset": [{"section": 1, "content": "..."}, ...]}
    """
    sections = []
    for number, paragraph in enumerate(text.split("\n\n"), start=1):
        sections.append({"section": number, "content": paragraph})
    return {"dataset": sections}
def restrict_to_pdf_query(query, dataset):
    """Return dataset sections whose content mentions the query.

    Matching is a case-insensitive substring test against each section's
    "content" field.

    Parameters
    ----------
    query : str
        The user's search text.
    dataset : dict
        Dataset in the {"dataset": [{"content": ...}, ...]} shape
        produced by text_to_json().

    Returns
    -------
    list[str]
        The matching sections' original content, or the single sentinel
        entry ["No relevant content found."] when nothing matches.
    """
    needle = query.lower()
    matches = [
        entry["content"]
        for entry in dataset["dataset"]
        if needle in entry["content"].lower()
    ]
    return matches or ["No relevant content found."]
# Function to split text into manageable chunks
def split_text_into_chunks(text, max_tokens=2000):
# Split text into chunks that fit within the model's token limit
chunks = []
current_chunk = ""
for paragraph in text.split("\n"):
# Check token length before adding paragraph
if len(current_chunk.split()) + len(paragraph.split()) > max_tokens:
chunks.append(current_chunk)
current_chunk = paragraph
else:
current_chunk += "\n" + paragraph
if current_chunk: # Add the last chunk
chunks.append(current_chunk)
return chunks
# Startup: build the JSON dataset from the bundled PDF.
pdf_path = "PAKISTAN PENAL CODE.pdf"  # Replace with the path to your PDF file
dataset_json = text_to_json(pdf_to_text(pdf_path))

# Persist the dataset so the query handler below can reload it.
with open("dataset.json", "w") as f:
    json.dump(dataset_json, f, indent=4)

# Groq API client; the key is read from the GROQ_API_KEY environment variable.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
# Streamlit UI: take a question, retrieve matching PDF sections, and ask
# the Groq model to answer from them.
st.title("RAG App Using Groq API")

user_query = st.text_input("Ask a question:")
if user_query:
    # Reload the dataset written at startup.
    with open("dataset.json", "r") as f:
        dataset = json.load(f)

    # Retrieve the sections that mention the query verbatim.
    pdf_based_answer = restrict_to_pdf_query(user_query, dataset)
    if pdf_based_answer[0] != "No relevant content found.":
        # Combine all relevant content, then split it to fit the model's
        # context window; only the first chunk is sent.
        relevant_text = "\n".join(pdf_based_answer)
        chunks = split_text_into_chunks(relevant_text)
        if chunks:
            # BUG FIX: the original appended the retrieved excerpt after
            # "Answer:", telling the model the excerpt already WAS the
            # answer. Present it as context and let the model answer.
            prompt = (
                "You are a Pakistani lawyer. Answer the following query based on "
                "the Pakistan Penal Code, explaining it in a professional and "
                "detailed manner, including references to specific sections of "
                "the code when applicable. Base your answer only on the context "
                "below.\n\n"
                f"Context:\n{chunks[0]}\n\n"
                f'Query: "{user_query}"\nAnswer:'
            )
            # Request the answer from the model.
            chat_completion = client.chat.completions.create(
                messages=[
                    {
                        "role": "user",
                        "content": prompt,
                    }
                ],
                # NOTE(review): preview model id — confirm it is still served.
                model="llama3-groq-70b-8192-tool-use-preview",
            )
            # Display the result.
            st.write(chat_completion.choices[0].message.content)
        else:
            st.write("Error: Unable to process content into chunks.")
    else:
        st.write("No relevant content found in the PDF dataset.")