# Source: Hugging Face upload ("Upload 25 files", commit 9ef8237) by HafeezBavikadi.
import os
from dotenv import load_dotenv
import ollama
from PyPDF2 import PdfReader
from google import generativeai as genai
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import pandas as pd
from streamlit import progress
# Load Environment and Set API Key
# Reads variables from a local .env file into the process environment.
load_dotenv()
# Configure the Gemini SDK with the key from the environment; if
# GOOGLE_API_KEY is unset this passes None and API calls will fail later.
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
# Shared embedding model used for both indexing (Database) and retrieval (Context).
embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
# Specify model
# Local ollama model name; QuestionGeneration branches on this value.
MODEL_NAME = "llama3.1"
# PDF Processing Class
class Database:
    """Build and persist a FAISS vector index from a collection of PDF files.

    Pipeline: extract text from every page -> split into overlapping chunks
    -> embed with the module-level `embeddings` model -> save to "faiss_index".
    """

    def __init__(self, pdf_docs):
        # pdf_docs: iterable of file paths or file-like objects accepted by PdfReader.
        self.pdf_docs = pdf_docs

    def _pdf_to_text(self):
        """Concatenate the text of every page of every PDF into self.text."""
        # Bug fix: extract_text() may return None for pages with no extractable
        # text (e.g. scanned images); fall back to "" so join() doesn't raise
        # TypeError on the first such page.
        self.text = "".join(
            (page.extract_text() or "")
            for pdf in self.pdf_docs
            for page in PdfReader(pdf).pages
        )

    def _text_to_chunks(self):
        """Split self.text into overlapping chunks sized for embedding."""
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        self.chunks = text_splitter.split_text(self.text)

    def _vectorstore(self):
        """Embed the chunks and save the FAISS index to disk for later retrieval."""
        vectorstore = FAISS.from_texts(self.chunks, embeddings)
        vectorstore.save_local("faiss_index")
        print("Vector embeddings saved")

    def store(self):
        """Run the full pipeline: extract text, chunk it, embed and persist."""
        self._pdf_to_text()
        self._text_to_chunks()
        self._vectorstore()
# Context Retrieval Class
class Context:
    """Turn a topic into a clarified query and fetch matching chunks from FAISS."""

    def __init__(self, topic):
        # Raw topic string provided by the caller.
        self.topic = topic

    def redefine(self):
        """Ask the local LLM to rewrite the topic as a retrieval-friendly query."""
        prompt_redefine = f"""
        You are an assistant creating queries for vector database retrieval based on topics. Given the Topic: '{self.topic}',
        return only the clarified query.
        """
        reply = ollama.generate(model=MODEL_NAME, prompt=prompt_redefine)
        self.clarified_query = reply["response"]
        return self.clarified_query

    def retrieve_faiss(self, query):
        """Similarity-search the saved index and log the retrieved documents.

        Returns the raw LangChain Document objects; the page contents are also
        appended to log/Retrieval_log.txt for inspection.
        """
        # allow_dangerous_deserialization is required to load a locally-saved
        # pickle-backed FAISS index; only safe because we wrote it ourselves.
        index = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)
        matches = index.similarity_search(query)
        contents = [doc.page_content for doc in matches]
        os.makedirs('log', exist_ok=True)
        with open("log/Retrieval_log.txt", "w") as log_file:
            log_file.write(f"Clarified Query: {query}\n")
            for idx, content in enumerate(contents, start=1):
                log_file.write(f"Document {idx}: {content}\n")
        return matches
# Question Generation Class
class QuestionGeneration:
    """Generate exam-style questions from a retrieved context via the local LLM."""

    def __init__(self, context, num_questions, question_type, conditions):
        self.context = context                # retrieved context passed into the prompt
        self.num_questions = num_questions    # requested question count
        self.question_type = question_type    # e.g. MCQ, short answer
        self.conditions = conditions          # free-form extra constraints

    def generate(self):
        """Return (count, questions) produced by the model.

        The delimiter between questions differs per model family, so the raw
        response is split accordingly. Returns the actual number of parsed
        questions, which may differ from num_questions.
        """
        prompt = f"""
        Generate {self.num_questions} questions based on the context provided.
        Context: {self.context}
        Total Questions: {self.num_questions}
        Question Type: {self.question_type}
        Conditions: {self.conditions}
        Provide the questions without any numbering or introduction.
        """
        response = ollama.generate(model=MODEL_NAME, prompt=prompt)
        raw = response["response"]
        if MODEL_NAME == "llama3.2":
            questions = raw.split('\n\n')
        elif MODEL_NAME == "llama3.1":
            questions = raw.split('\n')
        else:
            # Bug fix: previously `questions` was unbound (NameError) for any
            # other model name; default to newline splitting, dropping blanks.
            questions = [q for q in raw.split('\n') if q.strip()]
        print("Question generation successful")
        return len(questions), questions
# Answer Generation Class
class AnswerGeneration:
    """Generate one answer per question via the local LLM, with optional
    Streamlit progress reporting (progress bar and/or percentage text)."""

    def __init__(self, context, questions, question_type, conditions, percentage_text=None, progress_bar=None):
        self.context = context
        self.questions = questions
        self.question_type = question_type
        self.conditions = conditions
        self.progress_bar = progress_bar        # optional st.progress-like widget
        self.percentage_text = percentage_text  # optional st.empty/text-like widget

    def generate(self):
        """Return a list of answers, one per question, in input order."""
        answers = []
        total = len(self.questions)
        for i, question in enumerate(self.questions):
            prompt = f"""
            Answer the question: {question} using the following context: {self.context}
            Answer Type: {self.question_type}
            Conditions: {self.conditions}
            Directly provide the answer, without any formatting or symbols.
            """
            response = ollama.generate(model=MODEL_NAME, prompt=prompt)
            # Flatten newlines and strip markdown bold markers for CSV-friendliness.
            answer = response["response"].replace('\n', ' ').replace('**', ' ')
            print(f"Q{i}: Answer generation successful")
            answers.append(answer)
            # Bug fix: guard each widget separately — previously a supplied
            # progress_bar with percentage_text left as None raised
            # AttributeError on percentage_text.text(...). Also renamed the
            # local so it no longer shadows the module-level `progress` import.
            fraction = (i + 1) / total
            if self.progress_bar:
                self.progress_bar.progress(fraction)
            if self.percentage_text:
                self.percentage_text.text(f"Progress: {int(fraction * 100)}%")
        return answers
# Function to Convert Q&A to CSV
def create_csv(questions, answers, topic):
os.makedirs('csv', exist_ok=True) # Efficient folder creation
# Create DataFrame and Save as CSV
df = pd.DataFrame({'Question': questions, 'Answer': answers})
file_path = f"csv/Synthetic_Dataset_{topic}.csv"
df.to_csv(file_path, index=False)
print(df.head())
return file_path,df