import os
from dotenv import load_dotenv
import ollama
from PyPDF2 import PdfReader
from google import generativeai as genai
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import pandas as pd
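
# Pipeline overview (orientation comment):
#   Database           - PDFs -> text -> chunks -> FAISS index saved to disk
#   Context            - topic -> clarified query (via Ollama) -> retrieved chunks
#   QuestionGeneration - questions generated over the retrieved context
#   AnswerGeneration   - answers generated per question, with optional progress UI
#   create_csv         - persist the Q&A pairs as a CSV dataset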

# Load Environment and Set API Key
load_dotenv()
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")

# Specify model
MODEL_NAME = "llama3.1"


# PDF Processing Class
class Database:
    def __init__(self, pdf_docs):
        self.pdf_docs = pdf_docs

    def _pdf_to_text(self):
        # Extract text from every page of every PDF; extract_text() can
        # return None for image-only pages, so fall back to an empty string
        self.text = "".join(
            page.extract_text() or ""
            for pdf in self.pdf_docs
            for page in PdfReader(pdf).pages
        )

    def _text_to_chunks(self):
        # Split text into manageable chunks
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        self.chunks = text_splitter.split_text(self.text)

    def _vectorstore(self):
        # Save vectorized chunks for later retrieval
        vectorstore = FAISS.from_texts(self.chunks, embeddings)
        vectorstore.save_local("faiss_index")
        print("Vector embeddings saved")

    def store(self):
        self._pdf_to_text()
        self._text_to_chunks()
        self._vectorstore()


# Context Retrieval Class
class Context:
    def __init__(self, topic):
        self.topic = topic

    def redefine(self):
        prompt_redefine = f"""
        You are an assistant creating queries for vector database retrieval based on topics. Given the Topic: '{self.topic}',
        return only the clarified query.
        """
        redefined_response = ollama.generate(model=MODEL_NAME, prompt=prompt_redefine)
        self.clarified_query = redefined_response["response"]
        return self.clarified_query

    def retrieve_faiss(self, query):
        new_db = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)
        docs = new_db.similarity_search(query)
        pdf_docs = [doc.page_content for doc in docs]

        os.makedirs('log', exist_ok=True)
        with open("log/Retrieval_log.txt", "w") as file:
            file.write(f"Clarified Query: {query}\n")
            for i, pdf_doc in enumerate(pdf_docs, start=1):
                file.write(f"Document {i}: {pdf_doc}\n")
        return docs


# Question Generation Class
class QuestionGeneration:
    def __init__(self, context, num_questions, question_type, conditions):
        self.context = context
        self.num_questions = num_questions
        self.question_type = question_type
        self.conditions = conditions

    def generate(self):
        prompt = f"""
        Generate {self.num_questions} questions based on the context provided.

        Context: {self.context}
        Total Questions: {self.num_questions}
        Question Type: {self.question_type}
        Conditions: {self.conditions}

        Provide the questions without any numbering or introduction.
        """
        response = ollama.generate(model=MODEL_NAME, prompt=prompt)
        # llama3.2 tends to separate questions with blank lines, while
        # llama3.1 emits one per line; defaulting to per-line splitting keeps
        # `questions` defined even if MODEL_NAME is set to another model.
        if MODEL_NAME == "llama3.2":
            questions = response["response"].split('\n\n')
        else:
            questions = response["response"].split('\n')
        questions = [q.strip() for q in questions if q.strip()]  # drop blank entries

        print("Question generation successful")
        return len(questions), questions


# Answer Generation Class
class AnswerGeneration:
    def __init__(self, context, questions, question_type, conditions, percentage_text=None, progress_bar=None):
        self.context = context
        self.questions = questions
        self.question_type = question_type
        self.conditions = conditions
        self.progress_bar = progress_bar
        self.percentage_text = percentage_text

    def generate(self):
        answers = []
        for i, question in enumerate(self.questions):
            prompt = f"""
            Answer the question: {question} using the following context: {self.context}

            Answer Type: {self.question_type}
            Conditions: {self.conditions}

            Directly provide the answer, without any formatting or symbols.
            """
            response = ollama.generate(model=MODEL_NAME, prompt=prompt)
            answer = response["response"].replace('\n', ' ').replace('**', ' ')
            print(f"Q{i + 1}: Answer generation successful")
            answers.append(answer)
            # Update the optional Streamlit widgets; guard each one separately
            # so a missing percentage_text cannot raise an AttributeError
            progress = (i + 1) / len(self.questions)
            if self.progress_bar:
                self.progress_bar.progress(progress)
            if self.percentage_text:
                self.percentage_text.text(f"Progress: {int(progress * 100)}%")
        return answers


# Function to Convert Q&A to CSV
def create_csv(questions, answers, topic):
    os.makedirs('csv', exist_ok=True)  # Create the output folder if it does not exist

    # Create DataFrame and Save as CSV
    df = pd.DataFrame({'Question': questions, 'Answer': answers})
    file_path = f"csv/Synthetic_Dataset_{topic}.csv"
    df.to_csv(file_path, index=False)

    print(df.head())
    return file_path, df
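

# --- Example usage: a minimal sketch of the full pipeline. The PDF path,
# --- topic string, and question settings below are illustrative assumptions,
# --- not values from the original project.
if __name__ == "__main__":
    # Build the FAISS index from one or more local PDFs
    Database(["sample.pdf"]).store()

    # Turn the raw topic into a retrieval query, then fetch matching chunks
    ctx = Context("photosynthesis")
    query = ctx.redefine()
    docs = ctx.retrieve_faiss(query)
    context_text = "\n".join(doc.page_content for doc in docs)

    # Generate questions, answer each one, and persist the pairs to CSV
    _, questions = QuestionGeneration(context_text, 5, "short answer", "concise, factual").generate()
    answers = AnswerGeneration(context_text, questions, "short answer", "concise, factual").generate()
    create_csv(questions, answers, "photosynthesis")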