HafeezBavikadi commited on
Commit
9ef8237
·
verified ·
1 Parent(s): a1d5387

Upload 25 files

Browse files
Files changed (25) hide show
  1. Synthetic_data_generation-main/.DS_Store +0 -0
  2. Synthetic_data_generation-main/.env +1 -0
  3. Synthetic_data_generation-main/.idea/.gitignore +8 -0
  4. Synthetic_data_generation-main/.idea/Synthetic_data_generation.iml +10 -0
  5. Synthetic_data_generation-main/.idea/inspectionProfiles/profiles_settings.xml +6 -0
  6. Synthetic_data_generation-main/.idea/misc.xml +7 -0
  7. Synthetic_data_generation-main/.idea/modules.xml +8 -0
  8. Synthetic_data_generation-main/.idea/vcs.xml +6 -0
  9. Synthetic_data_generation-main/Dockerfile +14 -0
  10. Synthetic_data_generation-main/README.md +1 -0
  11. Synthetic_data_generation-main/Retrieval_log.txt +29 -0
  12. Synthetic_data_generation-main/Retrival_log.txt +25 -0
  13. Synthetic_data_generation-main/Test.ipynb +99 -0
  14. Synthetic_data_generation-main/backend.py +149 -0
  15. Synthetic_data_generation-main/csv/Synthetic_Dataset_Machine Learning .csv +9 -0
  16. Synthetic_data_generation-main/csv/Synthetic_Dataset_Robust Transformer-based TTS.csv +6 -0
  17. Synthetic_data_generation-main/csv/Synthetic_Dataset_Unconstrained Encoder-decoder Attention.csv +10 -0
  18. Synthetic_data_generation-main/csv/Synthetic_Dataset_encoder-decoder.csv +7 -0
  19. Synthetic_data_generation-main/faiss_index/index.faiss +0 -0
  20. Synthetic_data_generation-main/faiss_index/index.pkl +3 -0
  21. Synthetic_data_generation-main/frontend.py +102 -0
  22. Synthetic_data_generation-main/log/Retrieval_log.txt +5 -0
  23. Synthetic_data_generation-main/logo.jpeg +0 -0
  24. Synthetic_data_generation-main/main.py +77 -0
  25. Synthetic_data_generation-main/requirements.txt +96 -0
Synthetic_data_generation-main/.DS_Store ADDED
Binary file (6.15 kB). View file
 
Synthetic_data_generation-main/.env ADDED
@@ -0,0 +1 @@
 
 
1
+ GOOGLE_API_KEY = "REPLACE_WITH_YOUR_GOOGLE_API_KEY"  # NOTE(review): a real API key was committed here — revoke it immediately and supply secrets via environment variables, never version control
Synthetic_data_generation-main/.idea/.gitignore ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # Default ignored files
2
+ /shelf/
3
+ /workspace.xml
4
+ # Editor-based HTTP Client requests
5
+ /httpRequests/
6
+ # Datasource local storage ignored files
7
+ /dataSources/
8
+ /dataSources.local.xml
Synthetic_data_generation-main/.idea/Synthetic_data_generation.iml ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <module type="PYTHON_MODULE" version="4">
3
+ <component name="NewModuleRootManager">
4
+ <content url="file://$MODULE_DIR$">
5
+ <excludeFolder url="file://$MODULE_DIR$/.venv" />
6
+ </content>
7
+ <orderEntry type="inheritedJdk" />
8
+ <orderEntry type="sourceFolder" forTests="false" />
9
+ </component>
10
+ </module>
Synthetic_data_generation-main/.idea/inspectionProfiles/profiles_settings.xml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <component name="InspectionProjectProfileManager">
2
+ <settings>
3
+ <option name="USE_PROJECT_PROFILE" value="false" />
4
+ <version value="1.0" />
5
+ </settings>
6
+ </component>
Synthetic_data_generation-main/.idea/misc.xml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="Black">
4
+ <option name="sdkName" value="Python 3.9 (Synthetic_data_generation)" />
5
+ </component>
6
+ <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9 (Synthetic_data_generation)" project-jdk-type="Python SDK" />
7
+ </project>
Synthetic_data_generation-main/.idea/modules.xml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="ProjectModuleManager">
4
+ <modules>
5
+ <module fileurl="file://$PROJECT_DIR$/.idea/Synthetic_data_generation.iml" filepath="$PROJECT_DIR$/.idea/Synthetic_data_generation.iml" />
6
+ </modules>
7
+ </component>
8
+ </project>
Synthetic_data_generation-main/.idea/vcs.xml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="VcsDirectoryMappings">
4
+ <mapping directory="$PROJECT_DIR$" vcs="Git" />
5
+ </component>
6
+ </project>
Synthetic_data_generation-main/Dockerfile ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.9

WORKDIR /app

# Copy only the files the app needs at runtime.
COPY requirements.txt /app
COPY main.py /app
COPY logo.jpeg /app
COPY backend.py /app
COPY .env /app

RUN pip install --no-cache-dir -r requirements.txt

# Exec-form ENTRYPOINT. Fix: the original wrote "run " (trailing space),
# which streamlit receives as an unknown subcommand and aborts on.
ENTRYPOINT ["streamlit", "run", "main.py"]
14
+
Synthetic_data_generation-main/README.md ADDED
@@ -0,0 +1 @@
 
 
1
+ # Synthetic_data_generation
Synthetic_data_generation-main/Retrieval_log.txt ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Clarified Query: SELECT * FROM "VectorDatabase" WHERE `topic` = "encoder-decoder";
2
+ Document 1: rid of abnormal cases, instead gives rise to other issues suchas higher speech rate and weird rhythm.
3
+ In this paper, we remove the encoder-decoder attention
4
+ and apply a duration-based hard attention to copy encoderhidden states to their corresponding frames, forcing the de-
5
+ 1Such as URL, a sequence of numbers, and other texts which
6
+ are out of the domain of the training data
7
+ 8228coder to generate correct content. To have a holistic view
8
+ of the whole input as the original attention mechanism, we
9
+ replace the causal self-attention layer in the decoder with a
10
+ Document 2: mation is injected by adding two position embeddings to theoutput of the encoder and decoder pre-nets respectively. Theencoder is built with stacks of several identity blocks, eachcontains two sub-networks: a self-attention and a feed for-ward network. The decoder has the similar structure, whilethe self-attention is causal to attend to only the previouslydecoded frames, and an extra encoder-decoder attention isleveraged to attend to encoder hidden states.
11
+ Based on the final hidden states of the decoder, mel spec-
12
+ trum frames are generated autogressively with a linear layer
13
+ Figure 1: Architecture of TransfomerTTS.
14
+ followed by a post-net, which stops when a stop token is
15
+ predicted by a separate linear projection.
16
+ Similar to Tacotron2, TransformerTTS also borrows tech-
17
+ Document 3: 3.3 Pseudo Non-causal Attention
18
+ As discussed in Section 2.1, the encoder-decoder attentionmechanism is a crucial factor for the instability. However,simply removing this attention will also discard the advan-tages it brings to the TTS model. The advantages can beconsidered as the following two aspects. On the one hand,the encoder-decoder attention provides a holistic view of in-put sequence for the decoder, while on the other hand, itcomposes frame-level context vectors according to decoderinputs (which are mel frames). These two advantages makegreat contribution to the decoding procedure, and we pro-pose ”pseudo non-causal attention” (PNCA) to replace thecausal self-attention layers as shown in Figure 4, which notonly inherits the two features above, but also makes the de-coding procedure robust.
19
+ LetTbe the total length of mel spectrum to be decoded,
20
+ x
21
+ l
22
+ ibe the autoregressive output of step iand layer l,hibe
23
+ the tiled encoder hidden state of step i. For the time step
24
+ Document 4: RobuTrans differs from TransformerTTS in following as-
25
+ pects: 1) The input of Encoder is linguistic features, whichconsists of phonemic and prosodic features; 2) The positionembedding in the Encoder and Decoder is removed; 3) Theencoder-decoder attention is replaced with a duration basedhard attention; 4) The causal self-attention in Decoder is re-placed with pseudo non-causal attention.
26
+ 82303.1 Text-to-Linguistic-Feature Converter
27
+ We first convert the input text into linguistic features, which
28
+ consist of phonemic and prosodic features and then con-sumed by Encoder. To obtain the phonemic features, a rule-based system is used for the grapheme-to-phoneme conver-sion, which generates the phonemic categorical features
29
+ 2.
Synthetic_data_generation-main/Retrival_log.txt ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Clarified Query:Here is the query:
2
+
3
+ `"encoder-decoder attention mechanism without position encoding constraint"`Document 1:the third word at the third step, while on the other hand, af-
4
+ ter attending to the forth word at the fourth step, it attendsto the third word again. These two cases definitely outputbad cases. Although some variation of attention mechanisms(e.g. forward attention) has tried to construct a monotonouscontinuous correspondence between encoder and decoder,they cannot completely eliminate bad cases. Details will beshown in Section 4.6.
5
+ 2.2 Imprecise Stop PredictionDocument 2:thei-th step attends to the j-th word at the source side,
6
+ the(i+1 ) -th step must attend to the (j+n)-th word
7
+ (1≥n≥0), as shown in the left picture in Figure 2.
8
+ Previous models ignore this constraint, and learn the
9
+ alignment from the data totally, resulting in incorrect align-ments for special inputs. The right picture in Figure 2 showsan example of an abnormal alignment. On the one hand, theattention mechanism skips the second word and attends to
10
+ 8229Figure 2: Normal and abnormal alignments of encoder-
11
+ decoder attention. Mel spectrum frames (queries) are rangedhorizontally, while encoder hidden states (keys) are vertical.Left: normal alignment; the focus along keys are continuousand monotonous. Right: Abnormal alignment; the red linerepresents the skipping as well as retreating advance.
12
+ the third word at the third step, while on the other hand, af-Document 3:3.3 Pseudo Non-causal Attention
13
+ As discussed in Section 2.1, the encoder-decoder attentionmechanism is a crucial factor for the instability. However,simply removing this attention will also discard the advan-tages it brings to the TTS model. The advantages can beconsidered as the following two aspects. On the one hand,the encoder-decoder attention provides a holistic view of in-put sequence for the decoder, while on the other hand, itcomposes frame-level context vectors according to decoderinputs (which are mel frames). These two advantages makegreat contribution to the decoding procedure, and we pro-pose ”pseudo non-causal attention” (PNCA) to replace thecausal self-attention layers as shown in Figure 4, which notonly inherits the two features above, but also makes the de-coding procedure robust.
14
+ LetTbe the total length of mel spectrum to be decoded,
15
+ x
16
+ l
17
+ ibe the autoregressive output of step iand layer l,hibe
18
+ the tiled encoder hidden state of step i. For the time stepDocument 4:rid of abnormal cases, instead gives rise to other issues suchas higher speech rate and weird rhythm.
19
+ In this paper, we remove the encoder-decoder attention
20
+ and apply a duration-based hard attention to copy encoderhidden states to their corresponding frames, forcing the de-
21
+ 1Such as URL, a sequence of numbers, and other texts which
22
+ are out of the domain of the training data
23
+ 8228coder to generate correct content. To have a holistic view
24
+ of the whole input as the original attention mechanism, we
25
+ replace the causal self-attention layer in the decoder with a
Synthetic_data_generation-main/Test.ipynb ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "initial_id",
7
+ "metadata": {
8
+ "collapsed": true
9
+ },
10
+ "outputs": [],
11
+ "source": "import "
12
+ },
13
+ {
14
+ "metadata": {},
15
+ "cell_type": "code",
16
+ "outputs": [],
17
+ "execution_count": null,
18
+ "source": "",
19
+ "id": "ff1c18b395292c03"
20
+ },
21
+ {
22
+ "metadata": {},
23
+ "cell_type": "code",
24
+ "outputs": [],
25
+ "execution_count": null,
26
+ "source": "",
27
+ "id": "bc53fc83faa7ed78"
28
+ },
29
+ {
30
+ "metadata": {},
31
+ "cell_type": "code",
32
+ "outputs": [],
33
+ "execution_count": null,
34
+ "source": "",
35
+ "id": "5b0aa1cee4c0030e"
36
+ },
37
+ {
38
+ "metadata": {},
39
+ "cell_type": "code",
40
+ "outputs": [],
41
+ "execution_count": null,
42
+ "source": "",
43
+ "id": "e3cacb52b5b83314"
44
+ },
45
+ {
46
+ "metadata": {},
47
+ "cell_type": "code",
48
+ "outputs": [],
49
+ "execution_count": null,
50
+ "source": "",
51
+ "id": "7821460893af6a94"
52
+ },
53
+ {
54
+ "metadata": {},
55
+ "cell_type": "code",
56
+ "outputs": [],
57
+ "execution_count": null,
58
+ "source": "",
59
+ "id": "c2ed44c77bd16a1d"
60
+ },
61
+ {
62
+ "metadata": {},
63
+ "cell_type": "code",
64
+ "outputs": [],
65
+ "execution_count": null,
66
+ "source": "",
67
+ "id": "e6abdae43b6d3e6b"
68
+ },
69
+ {
70
+ "metadata": {},
71
+ "cell_type": "code",
72
+ "outputs": [],
73
+ "execution_count": null,
74
+ "source": "",
75
+ "id": "f09de4d352e263e3"
76
+ }
77
+ ],
78
+ "metadata": {
79
+ "kernelspec": {
80
+ "display_name": "Python 3",
81
+ "language": "python",
82
+ "name": "python3"
83
+ },
84
+ "language_info": {
85
+ "codemirror_mode": {
86
+ "name": "ipython",
87
+ "version": 2
88
+ },
89
+ "file_extension": ".py",
90
+ "mimetype": "text/x-python",
91
+ "name": "python",
92
+ "nbconvert_exporter": "python",
93
+ "pygments_lexer": "ipython2",
94
+ "version": "2.7.6"
95
+ }
96
+ },
97
+ "nbformat": 4,
98
+ "nbformat_minor": 5
99
+ }
Synthetic_data_generation-main/backend.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dotenv import load_dotenv
3
+ import ollama
4
+ from PyPDF2 import PdfReader
5
+ from google import generativeai as genai
6
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
7
+ from langchain_community.vectorstores import FAISS
8
+ from langchain_google_genai import GoogleGenerativeAIEmbeddings
9
+ import pandas as pd
10
+ from streamlit import progress
11
+
12
+ # Load Environment and Set API Key
13
+ load_dotenv()
14
+ genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
15
+ embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
16
+
17
+ # Specify model
18
+ MODEL_NAME = "llama3.1"
19
+
20
+
21
# PDF Processing Class
class Database:
    """Builds and persists a FAISS vector store from a collection of PDFs."""

    def __init__(self, pdf_docs):
        # pdf_docs: iterable of file-like objects or paths accepted by PdfReader.
        self.pdf_docs = pdf_docs

    def _pdf_to_text(self):
        # Extract text from every page of every PDF.
        # Fix: extract_text() can return None/empty for pages with no
        # extractable text (e.g. scanned images); coalesce to "" so that
        # "".join() does not raise TypeError on a None element.
        self.text = "".join(
            (page.extract_text() or "")
            for pdf in self.pdf_docs
            for page in PdfReader(pdf).pages
        )

    def _text_to_chunks(self):
        # Split the text into overlapping chunks sized for embedding.
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        self.chunks = text_splitter.split_text(self.text)

    def _vectorstore(self):
        # Embed the chunks and save the FAISS index locally for later retrieval.
        vectorstore = FAISS.from_texts(self.chunks, embeddings)
        vectorstore.save_local("faiss_index")
        print("Vector embeddings saved")

    def store(self):
        """Run the full pipeline: PDFs -> text -> chunks -> saved FAISS index."""
        self._pdf_to_text()
        self._text_to_chunks()
        self._vectorstore()
49
+
50
+
51
# Context Retrieval Class
class Context:
    """Clarifies a topic into a retrieval query and fetches matching chunks from FAISS."""

    def __init__(self, topic):
        self.topic = topic

    def redefine(self):
        """Ask the local LLM to turn the raw topic into a clarified retrieval query."""
        prompt_redefine = f"""
        You are an assistant creating queries for vector database retrieval based on topics. Given the Topic: '{self.topic}',
        return only the clarified query.
        """
        redefined_response = ollama.generate(model=MODEL_NAME, prompt=prompt_redefine)
        self.clarified_query = redefined_response["response"]
        return self.clarified_query

    def retrieve_faiss(self, query):
        """Similarity-search the saved FAISS index, log the hits, and return the docs."""
        new_db = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)
        docs = new_db.similarity_search(query)
        pdf_docs = [doc.page_content for doc in docs]

        os.makedirs('log', exist_ok=True)
        # Fix: retrieved chunks contain non-ASCII characters (math symbols,
        # curly quotes); force UTF-8 so logging never fails under a
        # platform-default codec such as cp1252.
        with open("log/Retrieval_log.txt", "w", encoding="utf-8") as file:
            file.write(f"Clarified Query: {query}\n")
            for i, pdf_doc in enumerate(pdf_docs, start=1):
                file.write(f"Document {i}: {pdf_doc}\n")
        return docs
76
+
77
+
78
# Question Generation Class
class QuestionGeneration:
    """Generates a list of questions for a given context via the local Ollama model."""

    def __init__(self, context, num_questions, question_type, conditions):
        self.context = context
        self.num_questions = num_questions
        self.question_type = question_type
        self.conditions = conditions

    def generate(self):
        """Return ``(count, questions)`` parsed from the model response.

        Fixes over the original:
        - ``questions`` was unbound (NameError) for any MODEL_NAME other than
          "llama3.1"/"llama3.2"; a generic newline split now applies as fallback.
        - blank fragments produced by splitting are dropped so the returned
          count reflects real questions only.
        """
        prompt = f"""
        Generate {self.num_questions} questions based on the context provided.

        Context: {self.context}
        Total Questions: {self.num_questions}
        Question Type: {self.question_type}
        Conditions: {self.conditions}

        Provide the questions without any numbering or introduction.
        """
        response = ollama.generate(model=MODEL_NAME, prompt=prompt)
        # llama3.2 separates questions with blank lines; other models emit
        # one question per line.
        separator = '\n\n' if MODEL_NAME == "llama3.2" else '\n'
        questions = [q.strip() for q in response["response"].split(separator) if q.strip()]

        print("Question generation successful")
        return len(questions), questions
105
+
106
+
107
+ # Answer Generation Class
108
+ class AnswerGeneration:
109
+ def __init__(self, context, questions, question_type, conditions,percentage_text=None,progress_bar=None):
110
+ self.context = context
111
+ self.questions = questions
112
+ self.question_type = question_type
113
+ self.conditions = conditions
114
+ self.progress_bar = progress_bar
115
+ self.percentage_text = percentage_text
116
+
117
+ def generate(self):
118
+ answers = []
119
+ for i, question in enumerate(self.questions):
120
+ prompt = f"""
121
+ Answer the question: {question} using the following context: {self.context}
122
+
123
+ Answer Type: {self.question_type}
124
+ Conditions: {self.conditions}
125
+
126
+ Directly provide the answer, without any formatting or symbols.
127
+ """
128
+ response = ollama.generate(model=MODEL_NAME, prompt=prompt)
129
+ answer = response["response"].replace('\n', ' ').replace('**', ' ')
130
+ print(f"Q{i}: Answer generation successful")
131
+ answers.append(answer)
132
+ if self.progress_bar:
133
+ progress =(i+1) / len(self.questions)
134
+ self.progress_bar.progress(progress)
135
+ self.percentage_text.text(f"Progress: {int(progress * 100)}%")
136
+ return answers
137
+
138
+
139
# Function to Convert Q&A to CSV
def create_csv(questions, answers, topic):
    """Write parallel question/answer lists to ``csv/Synthetic_Dataset_<topic>.csv``.

    Returns ``(file_path, dataframe)``. The topic is sanitized (stripped,
    path separators replaced) so user input cannot produce an invalid path
    or escape the csv/ directory; callers still work because they receive
    the returned path.
    """
    os.makedirs('csv', exist_ok=True)  # Efficient folder creation

    # Sanitize the topic for use inside a filename.
    safe_topic = topic.strip().replace(os.sep, '_').replace('/', '_')

    # Create DataFrame and Save as CSV
    df = pd.DataFrame({'Question': questions, 'Answer': answers})
    file_path = f"csv/Synthetic_Dataset_{safe_topic}.csv"
    df.to_csv(file_path, index=False)

    print(df.head())
    return file_path, df
Synthetic_data_generation-main/csv/Synthetic_Dataset_Machine Learning .csv ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ Question,Answer
2
+ What are the intended learning outcomes for session 1-2?,To understand the intended learning outcomes for session 1-2.
3
+ What is the main topic of the course being introduced in this document?,machine learning
4
+ "Which authors wrote the book ""IntroducBon to machine learning"" published by Cambridge University, UK.?",Aurélien G. and Christopher M. Bishop
5
+ "In which year was the edition of ""IntroducBon to machine learning"" published by MIT Press?",2020
6
+ What are the intended learning outcomes for session 3-4?,"To understand what are the intended learning outcomes for session 3-4, we need to look at the table of contents. For sessions 3 and 4, the topics include ""Paradigms of Learning"", ""Linear Regression, KNN"", ""Logistic Regression"", ""Decision Tree Classifiers"", ""Clustering"", and so on. The intended learning outcomes for these sessions are listed as: * CLO11: Understand the fundamental concepts of machine learning. * CLO12: Apply machine learning methods to solve complex problems. * CLO13: Analyze and evaluate the performance of machine learning models. * CLO21: Design and implement a machine learning model pipeline architecture. * CLO22: Evaluate the generalizability of a machine learning model. * CLO23: Understand the concept of VC dimension and its applications in machine learning. These outcomes are based on the course learning objectives (CLOs) listed in the provided documents."
7
+ What is the primary focus of the course being introduced in this document?,The primary focus of the course being introduced is machine learning.
8
+ Which topics are covered in sessions 5-6 and 7-8 respectively?,"Sessions 5-6 cover Linear Regression, KNN and Sessions 7-8 cover Overfitting."
9
+ What is the main theme of the course being introduced in this document?,The main theme of the course is machine learning.
Synthetic_data_generation-main/csv/Synthetic_Dataset_Robust Transformer-based TTS.csv ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ Question,Answer
2
+ "What is the main problem that current neural TTS models suffer from, as mentioned in the document?",robustness issue
3
+ How does RobuTrans model the holistic information of the input differently than TransformerTTS?,"RobuTrans models the holistic information of the input differently than TransformerTTS by replacing encoder-decoder attention with a duration-based hard attention mechanism and causal self-attention with a ""pseudo non-causal attention"" mechanism. Additionally, RobuTrans replaces position embedding with a 1-D CNN to model relative position information in a fixed window."
4
+ What role do prosodic features play in RobuTrans for synthesizing expressive speech?,The prosodic feature plays a critical role in RobuTrans for synthesizing expressive speech.
5
+ How does RobuTrans achieve parity MOS with other models like TransformerTTS and Tacotron2?,"RobuTrans achieves parity MOS with other models like TransformerTTS and Tacotron2 by making several modifications to the traditional transformer architecture, including converting input texts to linguistic features, using a duration-based hard attention mechanism in the decoder, replacing causal self-attention with a ""pseudo non-causal attention"" mechanism, and removing position embedding."
6
+ "What is the alternative to position embedding used in RobuTrans, as mentioned in Section 3.4?",A 1-D CNN
Synthetic_data_generation-main/csv/Synthetic_Dataset_Unconstrained Encoder-decoder Attention.csv ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ Question,Answer
2
+ What is the primary issue that the proposed method addresses in the context of text-to-speech (TTS) models?,"The primary issue that the proposed method addresses in the context of text-to-speech (TTS) models is the removal of encoder-decoder attention mechanisms to improve stability, while also introducing a pseudo non-causal attention mechanism to replace causal self-attention layers."
3
+ "The encoder-decoder attention mechanism is a crucial factor for instability, but removing it will also discard its advantages.","The encoder-decoder attention mechanism provides a holistic view of the input sequence for the decoder and composes frame-level context vectors according to decoder inputs. Removing it will discard its advantages but also prevents instability that can arise from certain cases, such as abnormal speech."
4
+ How does the proposed pseudo non-causal attention (PNCA) replace the causal self-attention layers in the decoder?,"The proposed pseudo non-causal attention (PNCA) replaces the causal self-attention layers in the decoder by using a combination of mel frames and encoder hidden states. The left part of PNCA is similar to causal self-attention, which takes encoder hidden states fused with padded mel spectrum frames as input, while the right part consumes only the encoder hidden states."
5
+ "PNCA takes the encoder hidden states fused with padded mel spectrum frames by a linear projection as input, and consumes only the encoder hidden states.",The encoder hidden states fused with padded mel spectrum frames by a linear projection is used as input for PNCA's causal self-attention.
6
+ What is the purpose of using duration predictor in the TTS model?,The purpose of using a duration predictor in the TTS model is to generate correct content by forcing the decoder to copy encoder hidden states to their corresponding frames based on predicted logarithmic duration.
7
+ To predict the logarithmic duration of each encoder hidden state and generate ground truth duration for the model training.,"To predict the logarithmic duration of each encoder hidden state and generate ground truth duration for model training, a duration predictor structure is used with two convolutional layers, normalization, dropout, and a linear projection. Mean squared error is employed as the loss function. Speech recognition tools are used to make forced alignment between audio and phoneme sequence, then predicted duration is used to copy and expand phoneme-level features to frame-level features accordingly."
8
+ How does the phoneme-level features are copied and expanded to frame-level features according to the predicted duration?,"The phoneme-level features are copied and expanded to frame-level features according to the predicted duration by first computing the logarithmic duration of each encoder hidden state using a duration predictor consisting of two convolutional layers and a linear projection. Then, the predicted duration is used to tile the encoded states into time steps. The tiled encoded states are concatenated with the mel spectrum frames processed by Decoder Pre-net, and then through a linear projection. This results in a frame-level context vector that can be used for pseudo non-causal attention."
9
+ The phoneme-level features are copied and expanded to frame-level features by using the predicted duration.,The predicted duration is used to copy and expand phoneme-level features to frame-level features.
10
+ What is the main advantage of PNCA in comparison to causal self-attention layers?,"The main advantage of PNCA over causal self-attention layers is that it provides a holistic view of the input sequence for the decoder while also composing frame-level context vectors according to decoder inputs, making the decoding procedure robust."
Synthetic_data_generation-main/csv/Synthetic_Dataset_encoder-decoder.csv ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ Question,Answer
2
+ What are the two aspects that the encoder-decoder attention provides?,holistic view and context vectors
3
+ What is the name of the proposed mechanism to replace causal self-attention layers?,PNCA
4
+ What feature does the pseudo non-causal attention inherit from the original attention mechanism?,Holistic view
5
+ What differs RobuTrans from TransformerTTS in terms of input for Encoder?,Linguistic
6
+ What layer in Decoder is replaced with pseudo non-causal attention in RobuTrans?,self-attention
7
+ What type of features are obtained using a rule-based system in Text-to-Linguistic-Feature Converter?,Features
Synthetic_data_generation-main/faiss_index/index.faiss ADDED
Binary file (40 kB). View file
 
Synthetic_data_generation-main/faiss_index/index.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f1ff93424ab40a8ed54fb21aed868c68ab32444ae2976670c2f47b9fdefe7404
3
+ size 9468
Synthetic_data_generation-main/frontend.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import PyPDF2
3
+ import pandas as pd
4
+ import os
5
+
6
# Function to extract text from the uploaded PDF
def extract_pdf_content(file):
    """Concatenate the extracted text of every page in the uploaded PDF."""
    reader = PyPDF2.PdfReader(file)
    content = ""
    for page_num in range(len(reader.pages)):
        # Fix: extract_text() may yield None for image-only pages; coalesce
        # to "" instead of raising TypeError on concatenation.
        content += reader.pages[page_num].extract_text() or ""
    return content
12
+ return content
13
+
14
# Function to generate questions and answers using dummy data
def generate_qa(content, topic, num_questions, answer_type, custom_conditions):
    """Produce placeholder question/answer lists (stand-in for the real backend)."""
    indices = range(1, num_questions + 1)
    questions = [f"Sample question {n} about {topic}" for n in indices]
    answers = [f"Sample {answer_type.lower()} answer for question {n}" for n in indices]
    return questions, answers
22
+
23
# Function to convert questions and answers into a CSV format and save to server
def save_to_csv(questions, answers):
    """Persist the Q&A pairs as saved_files/questions_answers.csv and return the path."""
    directory = "saved_files"
    os.makedirs(directory, exist_ok=True)  # Create directory if it doesn't exist
    frame = pd.DataFrame({'Questions': questions, 'Answers': answers})
    file_path = os.path.join(directory, 'questions_answers.csv')  # Full path to save
    frame.to_csv(file_path, index=False)
    return file_path
32
+
33
# Streamlit app structure
def app():
    """Render the Synthetic Data Generator UI and run the generate workflow."""
    # Logo and Title beside each other with slight upward adjustment
    col1, col2 = st.columns([1, 3])  # Adjust column width ratio

    with col1:
        # Fix: use the logo bundled with the repository (also copied into the
        # Docker image) instead of a hard-coded developer-machine path.
        st.image("logo.jpeg", width=150)

    with col2:
        st.markdown(
            "<h1 style='text-align: left; margin-top: -10px;'>Synthetic Data Generator</h1>",
            unsafe_allow_html=True
        )

    # Slogan below the title
    st.markdown("<h2 style='text-align: center;'>Your Reliable Synthetic Dataset Generation</h2>", unsafe_allow_html=True)

    # File Upload for PDF
    file = st.file_uploader("Drag your Content or Document (.pdf only)", type=['pdf'])

    # Topic input
    topic = st.text_input("Topic Name", placeholder="Enter the topic name")

    # Number of questions input
    num_questions = st.number_input("Number of Questions", min_value=1, max_value=100, value=5, step=1)

    # Answer type selection (horizontal)
    answer_type = st.radio("Answer Type", options=["One-word", "Short", "Long"], index=1, horizontal=True)

    # Custom conditions input
    custom_conditions = st.text_area("Custom Conditions", placeholder="Enter any custom rules for the LLM...")

    # Generate button
    generate_button = st.button("Generate")

    if generate_button and file and topic:
        # Extract content from the uploaded PDF
        content = extract_pdf_content(file)

        # Generate questions and answers
        questions, answers = generate_qa(content, topic, num_questions, answer_type, custom_conditions)

        # Display the generated questions and answers
        st.subheader("Generated Questions and Answers")
        for i, (q, a) in enumerate(zip(questions, answers), start=1):
            st.write(f"*Q{i}:* {q}")
            st.write(f"*A{i}:* {a}")
            st.write("---")

        # Save to CSV and provide a download link
        csv_file_path = save_to_csv(questions, answers)

        # Provide a message to indicate where the file is saved
        st.success(f"The CSV file has been saved to the server at: {csv_file_path}")

        # Provide a download link for the CSV file
        with open(csv_file_path, 'rb') as f:
            st.download_button(
                label="Download as CSV",
                data=f,
                file_name="questions_answers.csv",
                mime="text/csv"
            )

        # Add a message below the download button
        st.write("Click the button above to download your CSV file.")
99
+
100
# Run the Streamlit app
# Fix: the original guard used single underscores (_name_ / "_main_"),
# which raises NameError at runtime; the dunder form is required.
if __name__ == "__main__":
    app()
Synthetic_data_generation-main/log/Retrieval_log.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ Clarified Query: SELECT * FROM "ComputerScience" WHERE topic = 'Machine Learning'
2
+ Document 1: Session-Wise Topics and Reading/References CodeTextbook/Ar+cle/Report NameEdi+on/Year/LinkCLO Mapped1Alpaydin, E. IntroducBon to machine learning. MIT Press.2020CLO1, CLO2, & CLO32Smola, A., & Vishwanathan, S. V. N. IntroducBon to machine learning. Cambridge University, UK.2008CLO1, CLO2, & CLO3Sn.TopicSession Intended Learning OutcomeCLOReading Material Code1-2IntroducBon3, 3, 3, 2, 2, 1CLO11, 23-4Paradigms of Learning3, 3, 3, 2, 2, 1CLO11, 25-6Linear Regression, KNN3, 3, 3, 2, 2, 1CLO11, 27-8LogisBc Regression3, 3, 3, 2, 2, 1CLO11, 29-10Decision Tree Classifiers3, 3, 3, 2, 2, 1CLO11, 211-12Clustering3, 3, 3, 2, 2, 1CLO11, 213-14Gaussian Mixture Model3, 3, 3, 2, 2, 1CLO11, 215-16Training & TesBng2, 3, 3, 3, 2CLO21, 217-19GeneralizaBon2, 3, 3, 3, 2CLO21, 220-22VC Dimension2, 3, 3, 3, 2CLO21, 223-24Bias-Variance2, 3, 3, 3, 2CLO21, 225-26Neural Networks2, 3, 3, 3, 2CLO21, 227-28OverfiXng2, 3, 3, 3, 2CLO21, 229-30RegularizaBon 2, 3, 3, 3, 2CLO21, 231-32ValidaBon2, 3, 3, 3, 2CLO21,
3
+ Document 2: 3, 2CLO21, 223-24Bias-Variance2, 3, 3, 3, 2CLO21, 225-26Neural Networks2, 3, 3, 3, 2CLO21, 227-28OverfiXng2, 3, 3, 3, 2CLO21, 229-30RegularizaBon 2, 3, 3, 3, 2CLO21, 231-32ValidaBon2, 3, 3, 3, 2CLO21, 233-34Support Vector Machines2, 3, 3, 3, 2CLO21, 235-36Kernel Methods2, 3, 3, 3, 2CLO21, 237-39IntroducBon to Deep Learning2, 3, 3, 3, 1CLO31, 240-41IntroducBon to Reinforcement Learning2, 3, 3, 3, 1CLO31, 2 4
4
+ Document 3: Prepared by Faculty NameDr. Sai Charan AddankiEmail IDsaicharan.addanki@woxsen.edu.in Brief Descrip+on and Relevance of the Course This course introduces students to the exciBng field of machine learning, providing a comprehensive overview of key concepts, models, and pracBcal applicaBons. Designed for undergraduate students in their fiHh semester, this course aims to equip learners with the foundaBonal knowledge and skills necessary to understand, implement, and criBcally evaluate machine learning models. Programme LOs Addressed PLODefini+on1Engineering knowledge: Apply the knowledge of mathemaBcs, science, engineering fundamentals, and an engineering specializaBon to the soluBon of complex engineering problems2Problem analysis: IdenBfy, formulate, review research literature, and analyze complex engineering problems related to Computer Science and Engineering and reaching substanBated conclusions using first principles of mathemaBcs, natural sciences, and engineering
5
+ Document 4: change 2 Course LOs Course LOMapping to Programme ILOCLO 1: Understanding of ML methods and their appropriate applicaBon3, 3, 3, 2, 2, 1CLO 2: Knowledge of ML model pipeline architecture2, 3, 3, 3, 2CLO 3: ApplicaBons of ML in advanced topics2, 3, 3, 3, 1
Synthetic_data_generation-main/logo.jpeg ADDED
Synthetic_data_generation-main/main.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Streamlit front-end for the Synthetic Data Generator.

Pipeline: upload PDFs -> store chunks in the vector store -> retrieve
topic-relevant context -> generate questions -> generate answers ->
preview the Q/A pairs and offer them as a CSV download.
"""
import streamlit as st
from backend import Database, Context, QuestionGeneration, AnswerGeneration, create_csv

# --- Header: logo on the left, title on the right ---
col1, col2 = st.columns([1, 3])
with col1:
    st.image("logo.jpeg", width=150)
with col2:
    st.markdown(
        "<h1 style='text-align: left; margin-top: -10px;'>Synthetic Data Generator</h1>",
        unsafe_allow_html=True
    )
st.markdown("<h2 style='text-align: center;'>Your Reliable Synthetic Dataset Generation</h2>", unsafe_allow_html=True)

# --- Input widgets ---
file = st.file_uploader("Choose PDF Files", accept_multiple_files=True, type="pdf")
topic = st.text_input("Topic Name", placeholder="Enter the topic name")
num_questions = st.number_input("Number of Questions", min_value=5, max_value=100, value=8)
answer_type = st.radio("Answer Type", options=["One-word", "Short", "Long"], index=1, horizontal=True)
custom_conditions = st.text_area("Custom Conditions", placeholder="Enter any custom rules for the LLM...")

# --- Generation pipeline: runs only when the button is pressed AND both a
# --- file and a topic were provided (silent no-op otherwise).
if st.button("Generate") and file and topic:
    # Chunk the uploaded PDFs and persist them in the vector store.
    with st.spinner("Storing in Database..."):
        db = Database(file)
        db.store()

    # Rewrite the raw topic into a clarified query, then fetch matching
    # context chunks from the FAISS index.
    # Fixed typo in user-facing spinner text: "Retriving" -> "Retrieving".
    with st.spinner("Retrieving Contexts..."):
        context_obj = Context(topic)
        clarified_query = context_obj.redefine()
        context_content = context_obj.retrieve_faiss(clarified_query)

    with st.spinner("Generating Questions..."):
        question_gen = QuestionGeneration(context=context_content, num_questions=num_questions,
                                          question_type=answer_type, conditions=custom_conditions)
        total_questions, questions = question_gen.generate()

    # Report how many questions were actually produced.
    st.write(f"Total {total_questions} Questions Generated")

    # Fixed: dropped the pointless f-prefix on a placeholder-free string.
    with st.spinner("Generating Answer"):
        # CSS to enlarge and recolor the Streamlit progress bar.
        st.markdown("""
            <style>
            .stProgress > div > div > div > div {
                background-color: red; /* Change the color to red */
                height: 25px; /* Increase this value to make it larger */
            }
            </style>
            """, unsafe_allow_html=True)

        progress_bar = st.progress(0)
        percentage_text = st.empty()

        # Generate one answer per question, reporting progress through the
        # progress-bar and percentage widgets created above.
        answer_gen = AnswerGeneration(context_content, questions, answer_type, custom_conditions,
                                      percentage_text=percentage_text, progress_bar=progress_bar)
        answers = answer_gen.generate()

    st.success("Answers Generated")

    # Persist the Q/A pairs to CSV and show an in-page preview.
    csv_file_path, df = create_csv(questions, answers, topic)
    st.write("Preview of Data")
    st.dataframe(df)

    # Offer the CSV for download.
    # Fixed: renamed the file handle so it no longer shadows the `file`
    # variable holding the uploaded PDFs.
    with open(csv_file_path, 'rb') as csv_fh:
        st.write("Click the below button to download your CSV file.")
        st.download_button(label="Download as CSV", data=csv_fh,
                           file_name=f"{topic}_questions_answers.csv", mime="text/csv")
Synthetic_data_generation-main/requirements.txt ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiohappyeyeballs==2.4.0
2
+ aiohttp==3.10.6
3
+ aiosignal==1.3.1
4
+ altair==5.4.1
5
+ annotated-types==0.7.0
6
+ anyio==4.6.0
7
+ async-timeout==4.0.3
8
+ attrs==24.2.0
9
+ beautifulsoup4==4.12.3
10
+ blinker==1.8.2
11
+ cachetools==5.5.0
12
+ certifi==2024.8.30
13
+ charset-normalizer==3.3.2
14
+ click==8.1.7
15
+ dataclasses-json==0.6.7
16
+ exceptiongroup==1.2.2
17
+ faiss-cpu==1.9.0
18
+ frozenlist==1.4.1
19
+ gitdb==4.0.11
20
+ GitPython==3.1.43
21
+ google==3.0.0
22
+ google-ai-generativelanguage==0.6.6
23
+ google-api-core==2.20.0
24
+ google-api-python-client==2.147.0
25
+ google-auth==2.35.0
26
+ google-auth-httplib2==0.2.0
27
+ google-generativeai==0.7.2
28
+ googleapis-common-protos==1.65.0
29
+ grpcio==1.66.1
30
+ grpcio-status==1.62.3
31
+ h11==0.14.0
32
+ httpcore==1.0.5
33
+ httplib2==0.22.0
34
+ httpx==0.27.2
35
+ idna==3.10
36
+ Jinja2==3.1.4
37
+ jsonpatch==1.33
38
+ jsonpointer==3.0.0
39
+ jsonschema==4.23.0
40
+ jsonschema-specifications==2024.10.1
41
+ langchain==0.3.0
42
+ langchain-community==0.3.0
43
+ langchain-core==0.3.5
44
+ langchain-google-genai==2.0.0
45
+ langchain-text-splitters==0.3.0
46
+ langsmith==0.1.128
47
+ markdown-it-py==3.0.0
48
+ MarkupSafe==3.0.2
49
+ marshmallow==3.22.0
50
+ mdurl==0.1.2
51
+ multidict==6.1.0
52
+ mypy-extensions==1.0.0
53
+ narwhals==1.13.1
54
+ numpy==1.26.4
55
+ ollama==0.3.3
56
+ orjson==3.10.7
57
+ packaging==24.1
58
+ pandas==2.2.3
59
+ pillow==10.4.0
60
+ proto-plus==1.24.0
61
+ protobuf==4.25.5
62
+ pyarrow==18.0.0
63
+ pyasn1==0.6.1
64
+ pyasn1_modules==0.4.1
65
+ pydantic==2.9.2
66
+ pydantic-settings==2.5.2
67
+ pydantic_core==2.23.4
68
+ pydeck==0.9.1
69
+ Pygments==2.18.0
70
+ pyparsing==3.1.4
71
+ PyPDF2==3.0.1
72
+ python-dateutil==2.9.0.post0
73
+ python-dotenv==1.0.1
74
+ pytz==2024.2
75
+ PyYAML==6.0.2
76
+ referencing==0.35.1
77
+ requests==2.32.3
78
+ rich==13.9.4
79
+ rpds-py==0.20.1
80
+ rsa==4.9
81
+ six==1.16.0
82
+ smmap==5.0.1
83
+ sniffio==1.3.1
84
+ soupsieve==2.6
85
+ SQLAlchemy==2.0.35
86
+ streamlit==1.39.0
87
+ tenacity==8.5.0
88
+ toml==0.10.2
89
+ tornado==6.4.1
90
+ tqdm==4.66.5
91
+ typing-inspect==0.9.0
92
+ typing_extensions==4.12.2
93
+ tzdata==2024.2
94
+ uritemplate==4.1.1
95
+ urllib3==2.2.3
96
+ yarl==1.12.1