HafeezBavikadi commited on
Commit
9ef8237
·
verified ·
1 Parent(s): a1d5387

Upload 25 files

Browse files
Files changed (25) hide show
  1. Synthetic_data_generation-main/.DS_Store +0 -0
  2. Synthetic_data_generation-main/.env +1 -0
  3. Synthetic_data_generation-main/.idea/.gitignore +8 -0
  4. Synthetic_data_generation-main/.idea/Synthetic_data_generation.iml +10 -0
  5. Synthetic_data_generation-main/.idea/inspectionProfiles/profiles_settings.xml +6 -0
  6. Synthetic_data_generation-main/.idea/misc.xml +7 -0
  7. Synthetic_data_generation-main/.idea/modules.xml +8 -0
  8. Synthetic_data_generation-main/.idea/vcs.xml +6 -0
  9. Synthetic_data_generation-main/Dockerfile +14 -0
  10. Synthetic_data_generation-main/README.md +1 -0
  11. Synthetic_data_generation-main/Retrieval_log.txt +29 -0
  12. Synthetic_data_generation-main/Retrival_log.txt +25 -0
  13. Synthetic_data_generation-main/Test.ipynb +99 -0
  14. Synthetic_data_generation-main/backend.py +149 -0
  15. Synthetic_data_generation-main/csv/Synthetic_Dataset_Machine Learning .csv +9 -0
  16. Synthetic_data_generation-main/csv/Synthetic_Dataset_Robust Transformer-based TTS.csv +6 -0
  17. Synthetic_data_generation-main/csv/Synthetic_Dataset_Unconstrained Encoder-decoder Attention.csv +10 -0
  18. Synthetic_data_generation-main/csv/Synthetic_Dataset_encoder-decoder.csv +7 -0
  19. Synthetic_data_generation-main/faiss_index/index.faiss +0 -0
  20. Synthetic_data_generation-main/faiss_index/index.pkl +3 -0
  21. Synthetic_data_generation-main/frontend.py +102 -0
  22. Synthetic_data_generation-main/log/Retrieval_log.txt +5 -0
  23. Synthetic_data_generation-main/logo.jpeg +0 -0
  24. Synthetic_data_generation-main/main.py +77 -0
  25. Synthetic_data_generation-main/requirements.txt +96 -0
Synthetic_data_generation-main/.DS_Store ADDED
Binary file (6.15 kB). View file
 
Synthetic_data_generation-main/.env ADDED
@@ -0,0 +1 @@
 
 
1
+ GOOGLE_API_KEY = "REPLACE_WITH_YOUR_GOOGLE_API_KEY"  # NOTE(review): a real API key was committed here — revoke it immediately and supply secrets via environment variables, never version control
Synthetic_data_generation-main/.idea/.gitignore ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # Default ignored files
2
+ /shelf/
3
+ /workspace.xml
4
+ # Editor-based HTTP Client requests
5
+ /httpRequests/
6
+ # Datasource local storage ignored files
7
+ /dataSources/
8
+ /dataSources.local.xml
Synthetic_data_generation-main/.idea/Synthetic_data_generation.iml ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <module type="PYTHON_MODULE" version="4">
3
+ <component name="NewModuleRootManager">
4
+ <content url="file://$MODULE_DIR$">
5
+ <excludeFolder url="file://$MODULE_DIR$/.venv" />
6
+ </content>
7
+ <orderEntry type="inheritedJdk" />
8
+ <orderEntry type="sourceFolder" forTests="false" />
9
+ </component>
10
+ </module>
Synthetic_data_generation-main/.idea/inspectionProfiles/profiles_settings.xml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <component name="InspectionProjectProfileManager">
2
+ <settings>
3
+ <option name="USE_PROJECT_PROFILE" value="false" />
4
+ <version value="1.0" />
5
+ </settings>
6
+ </component>
Synthetic_data_generation-main/.idea/misc.xml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="Black">
4
+ <option name="sdkName" value="Python 3.9 (Synthetic_data_generation)" />
5
+ </component>
6
+ <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9 (Synthetic_data_generation)" project-jdk-type="Python SDK" />
7
+ </project>
Synthetic_data_generation-main/.idea/modules.xml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="ProjectModuleManager">
4
+ <modules>
5
+ <module fileurl="file://$PROJECT_DIR$/.idea/Synthetic_data_generation.iml" filepath="$PROJECT_DIR$/.idea/Synthetic_data_generation.iml" />
6
+ </modules>
7
+ </component>
8
+ </project>
Synthetic_data_generation-main/.idea/vcs.xml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="VcsDirectoryMappings">
4
+ <mapping directory="$PROJECT_DIR$" vcs="Git" />
5
+ </component>
6
+ </project>
Synthetic_data_generation-main/Dockerfile ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.9

WORKDIR /app

# Copy only the files the app needs at runtime.
COPY requirements.txt /app
COPY main.py /app
COPY logo.jpeg /app
COPY backend.py /app
COPY .env /app

RUN pip install --no-cache-dir -r requirements.txt

# Exec-form ENTRYPOINT. Fix: the original wrote "run " (trailing space),
# which streamlit receives as an unknown subcommand and aborts on.
ENTRYPOINT ["streamlit", "run", "main.py"]
14
+
Synthetic_data_generation-main/README.md ADDED
@@ -0,0 +1 @@
 
 
1
+ # Synthetic_data_generation
Synthetic_data_generation-main/Retrieval_log.txt ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Clarified Query: SELECT * FROM "VectorDatabase" WHERE `topic` = "encoder-decoder";
2
+ Document 1: rid of abnormal cases, instead gives rise to other issues suchas higher speech rate and weird rhythm.
3
+ In this paper, we remove the encoder-decoder attention
4
+ and apply a duration-based hard attention to copy encoderhidden states to their corresponding frames, forcing the de-
5
+ 1Such as URL, a sequence of numbers, and other texts which
6
+ are out of the domain of the training data
7
+ 8228coder to generate correct content. To have a holistic view
8
+ of the whole input as the original attention mechanism, we
9
+ replace the causal self-attention layer in the decoder with a
10
+ Document 2: mation is injected by adding two position embeddings to theoutput of the encoder and decoder pre-nets respectively. Theencoder is built with stacks of several identity blocks, eachcontains two sub-networks: a self-attention and a feed for-ward network. The decoder has the similar structure, whilethe self-attention is causal to attend to only the previouslydecoded frames, and an extra encoder-decoder attention isleveraged to attend to encoder hidden states.
11
+ Based on the final hidden states of the decoder, mel spec-
12
+ trum frames are generated autogressively with a linear layer
13
+ Figure 1: Architecture of TransfomerTTS.
14
+ followed by a post-net, which stops when a stop token is
15
+ predicted by a separate linear projection.
16
+ Similar to Tacotron2, TransformerTTS also borrows tech-
17
+ Document 3: 3.3 Pseudo Non-causal Attention
18
+ As discussed in Section 2.1, the encoder-decoder attentionmechanism is a crucial factor for the instability. However,simply removing this attention will also discard the advan-tages it brings to the TTS model. The advantages can beconsidered as the following two aspects. On the one hand,the encoder-decoder attention provides a holistic view of in-put sequence for the decoder, while on the other hand, itcomposes frame-level context vectors according to decoderinputs (which are mel frames). These two advantages makegreat contribution to the decoding procedure, and we pro-pose ”pseudo non-causal attention” (PNCA) to replace thecausal self-attention layers as shown in Figure 4, which notonly inherits the two features above, but also makes the de-coding procedure robust.
19
+ LetTbe the total length of mel spectrum to be decoded,
20
+ x
21
+ l
22
+ ibe the autoregressive output of step iand layer l,hibe
23
+ the tiled encoder hidden state of step i. For the time step
24
+ Document 4: RobuTrans differs from TransformerTTS in following as-
25
+ pects: 1) The input of Encoder is linguistic features, whichconsists of phonemic and prosodic features; 2) The positionembedding in the Encoder and Decoder is removed; 3) Theencoder-decoder attention is replaced with a duration basedhard attention; 4) The causal self-attention in Decoder is re-placed with pseudo non-causal attention.
26
+ 82303.1 Text-to-Linguistic-Feature Converter
27
+ We first convert the input text into linguistic features, which
28
+ consist of phonemic and prosodic features and then con-sumed by Encoder. To obtain the phonemic features, a rule-based system is used for the grapheme-to-phoneme conver-sion, which generates the phonemic categorical features
29
+ 2.
Synthetic_data_generation-main/Retrival_log.txt ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Clarified Query:Here is the query:
2
+
3
+ `"encoder-decoder attention mechanism without position encoding constraint"`Document 1:the third word at the third step, while on the other hand, af-
4
+ ter attending to the forth word at the fourth step, it attendsto the third word again. These two cases definitely outputbad cases. Although some variation of attention mechanisms(e.g. forward attention) has tried to construct a monotonouscontinuous correspondence between encoder and decoder,they cannot completely eliminate bad cases. Details will beshown in Section 4.6.
5
+ 2.2 Imprecise Stop PredictionDocument 2:thei-th step attends to the j-th word at the source side,
6
+ the(i+1 ) -th step must attend to the (j+n)-th word
7
+ (1≥n≥0), as shown in the left picture in Figure 2.
8
+ Previous models ignore this constraint, and learn the
9
+ alignment from the data totally, resulting in incorrect align-ments for special inputs. The right picture in Figure 2 showsan example of an abnormal alignment. On the one hand, theattention mechanism skips the second word and attends to
10
+ 8229Figure 2: Normal and abnormal alignments of encoder-
11
+ decoder attention. Mel spectrum frames (queries) are rangedhorizontally, while encoder hidden states (keys) are vertical.Left: normal alignment; the focus along keys are continuousand monotonous. Right: Abnormal alignment; the red linerepresents the skipping as well as retreating advance.
12
+ the third word at the third step, while on the other hand, af-Document 3:3.3 Pseudo Non-causal Attention
13
+ As discussed in Section 2.1, the encoder-decoder attentionmechanism is a crucial factor for the instability. However,simply removing this attention will also discard the advan-tages it brings to the TTS model. The advantages can beconsidered as the following two aspects. On the one hand,the encoder-decoder attention provides a holistic view of in-put sequence for the decoder, while on the other hand, itcomposes frame-level context vectors according to decoderinputs (which are mel frames). These two advantages makegreat contribution to the decoding procedure, and we pro-pose ”pseudo non-causal attention” (PNCA) to replace thecausal self-attention layers as shown in Figure 4, which notonly inherits the two features above, but also makes the de-coding procedure robust.
14
+ LetTbe the total length of mel spectrum to be decoded,
15
+ x
16
+ l
17
+ ibe the autoregressive output of step iand layer l,hibe
18
+ the tiled encoder hidden state of step i. For the time stepDocument 4:rid of abnormal cases, instead gives rise to other issues suchas higher speech rate and weird rhythm.
19
+ In this paper, we remove the encoder-decoder attention
20
+ and apply a duration-based hard attention to copy encoderhidden states to their corresponding frames, forcing the de-
21
+ 1Such as URL, a sequence of numbers, and other texts which
22
+ are out of the domain of the training data
23
+ 8228coder to generate correct content. To have a holistic view
24
+ of the whole input as the original attention mechanism, we
25
+ replace the causal self-attention layer in the decoder with a
Synthetic_data_generation-main/Test.ipynb ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "initial_id",
7
+ "metadata": {
8
+ "collapsed": true
9
+ },
10
+ "outputs": [],
11
+ "source": "import "
12
+ },
13
+ {
14
+ "metadata": {},
15
+ "cell_type": "code",
16
+ "outputs": [],
17
+ "execution_count": null,
18
+ "source": "",
19
+ "id": "ff1c18b395292c03"
20
+ },
21
+ {
22
+ "metadata": {},
23
+ "cell_type": "code",
24
+ "outputs": [],
25
+ "execution_count": null,
26
+ "source": "",
27
+ "id": "bc53fc83faa7ed78"
28
+ },
29
+ {
30
+ "metadata": {},
31
+ "cell_type": "code",
32
+ "outputs": [],
33
+ "execution_count": null,
34
+ "source": "",
35
+ "id": "5b0aa1cee4c0030e"
36
+ },
37
+ {
38
+ "metadata": {},
39
+ "cell_type": "code",
40
+ "outputs": [],
41
+ "execution_count": null,
42
+ "source": "",
43
+ "id": "e3cacb52b5b83314"
44
+ },
45
+ {
46
+ "metadata": {},
47
+ "cell_type": "code",
48
+ "outputs": [],
49
+ "execution_count": null,
50
+ "source": "",
51
+ "id": "7821460893af6a94"
52
+ },
53
+ {
54
+ "metadata": {},
55
+ "cell_type": "code",
56
+ "outputs": [],
57
+ "execution_count": null,
58
+ "source": "",
59
+ "id": "c2ed44c77bd16a1d"
60
+ },
61
+ {
62
+ "metadata": {},
63
+ "cell_type": "code",
64
+ "outputs": [],
65
+ "execution_count": null,
66
+ "source": "",
67
+ "id": "e6abdae43b6d3e6b"
68
+ },
69
+ {
70
+ "metadata": {},
71
+ "cell_type": "code",
72
+ "outputs": [],
73
+ "execution_count": null,
74
+ "source": "",
75
+ "id": "f09de4d352e263e3"
76
+ }
77
+ ],
78
+ "metadata": {
79
+ "kernelspec": {
80
+ "display_name": "Python 3",
81
+ "language": "python",
82
+ "name": "python3"
83
+ },
84
+ "language_info": {
85
+ "codemirror_mode": {
86
+ "name": "ipython",
87
+ "version": 2
88
+ },
89
+ "file_extension": ".py",
90
+ "mimetype": "text/x-python",
91
+ "name": "python",
92
+ "nbconvert_exporter": "python",
93
+ "pygments_lexer": "ipython2",
94
+ "version": "2.7.6"
95
+ }
96
+ },
97
+ "nbformat": 4,
98
+ "nbformat_minor": 5
99
+ }
Synthetic_data_generation-main/backend.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dotenv import load_dotenv
3
+ import ollama
4
+ from PyPDF2 import PdfReader
5
+ from google import generativeai as genai
6
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
7
+ from langchain_community.vectorstores import FAISS
8
+ from langchain_google_genai import GoogleGenerativeAIEmbeddings
9
+ import pandas as pd
10
+ from streamlit import progress
11
+
12
+ # Load Environment and Set API Key
13
+ load_dotenv()
14
+ genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
15
+ embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
16
+
17
+ # Specify model
18
+ MODEL_NAME = "llama3.1"
19
+
20
+
21
# PDF Processing Class
class Database:
    """Builds and persists a FAISS vector store from a collection of PDFs."""

    def __init__(self, pdf_docs):
        # pdf_docs: iterable of file-like objects or paths accepted by PdfReader.
        self.pdf_docs = pdf_docs

    def _pdf_to_text(self):
        # Extract text from every page of every PDF.
        # Fix: extract_text() can return None/empty for pages with no
        # extractable text (e.g. scanned images); coalesce to "" so that
        # "".join() does not raise TypeError on a None element.
        self.text = "".join(
            (page.extract_text() or "")
            for pdf in self.pdf_docs
            for page in PdfReader(pdf).pages
        )

    def _text_to_chunks(self):
        # Split the text into overlapping chunks sized for embedding.
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        self.chunks = text_splitter.split_text(self.text)

    def _vectorstore(self):
        # Embed the chunks and save the FAISS index locally for later retrieval.
        vectorstore = FAISS.from_texts(self.chunks, embeddings)
        vectorstore.save_local("faiss_index")
        print("Vector embeddings saved")

    def store(self):
        """Run the full pipeline: PDFs -> text -> chunks -> saved FAISS index."""
        self._pdf_to_text()
        self._text_to_chunks()
        self._vectorstore()
49
+
50
+
51
# Context Retrieval Class
class Context:
    """Clarifies a topic into a retrieval query and fetches matching chunks from FAISS."""

    def __init__(self, topic):
        self.topic = topic

    def redefine(self):
        """Ask the local LLM to turn the raw topic into a clarified retrieval query."""
        prompt_redefine = f"""
        You are an assistant creating queries for vector database retrieval based on topics. Given the Topic: '{self.topic}',
        return only the clarified query.
        """
        redefined_response = ollama.generate(model=MODEL_NAME, prompt=prompt_redefine)
        self.clarified_query = redefined_response["response"]
        return self.clarified_query

    def retrieve_faiss(self, query):
        """Similarity-search the saved FAISS index, log the hits, and return the docs."""
        new_db = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)
        docs = new_db.similarity_search(query)
        pdf_docs = [doc.page_content for doc in docs]

        os.makedirs('log', exist_ok=True)
        # Fix: retrieved chunks contain non-ASCII characters (math symbols,
        # curly quotes); force UTF-8 so logging never fails under a
        # platform-default codec such as cp1252.
        with open("log/Retrieval_log.txt", "w", encoding="utf-8") as file:
            file.write(f"Clarified Query: {query}\n")
            for i, pdf_doc in enumerate(pdf_docs, start=1):
                file.write(f"Document {i}: {pdf_doc}\n")
        return docs
76
+
77
+
78
# Question Generation Class
class QuestionGeneration:
    """Generates a list of questions for a given context via the local Ollama model."""

    def __init__(self, context, num_questions, question_type, conditions):
        self.context = context
        self.num_questions = num_questions
        self.question_type = question_type
        self.conditions = conditions

    def generate(self):
        """Return ``(count, questions)`` parsed from the model response.

        Fixes over the original:
        - ``questions`` was unbound (NameError) for any MODEL_NAME other than
          "llama3.1"/"llama3.2"; a generic newline split now applies as fallback.
        - blank fragments produced by splitting are dropped so the returned
          count reflects real questions only.
        """
        prompt = f"""
        Generate {self.num_questions} questions based on the context provided.

        Context: {self.context}
        Total Questions: {self.num_questions}
        Question Type: {self.question_type}
        Conditions: {self.conditions}

        Provide the questions without any numbering or introduction.
        """
        response = ollama.generate(model=MODEL_NAME, prompt=prompt)
        # llama3.2 separates questions with blank lines; other models emit
        # one question per line.
        separator = '\n\n' if MODEL_NAME == "llama3.2" else '\n'
        questions = [q.strip() for q in response["response"].split(separator) if q.strip()]

        print("Question generation successful")
        return len(questions), questions
105
+
106
+
107
+ # Answer Generation Class
108
+ class AnswerGeneration:
109
+ def __init__(self, context, questions, question_type, conditions,percentage_text=None,progress_bar=None):
110
+ self.context = context
111
+ self.questions = questions
112
+ self.question_type = question_type
113
+ self.conditions = conditions
114
+ self.progress_bar = progress_bar
115
+ self.percentage_text = percentage_text
116
+
117
+ def generate(self):
118
+ answers = []
119
+ for i, question in enumerate(self.questions):
120
+ prompt = f"""
121
+ Answer the question: {question} using the following context: {self.context}
122
+
123
+ Answer Type: {self.question_type}
124
+ Conditions: {self.conditions}
125
+
126
+ Directly provide the answer, without any formatting or symbols.
127
+ """
128
+ response = ollama.generate(model=MODEL_NAME, prompt=prompt)
129
+ answer = response["response"].replace('\n', ' ').replace('**', ' ')
130
+ print(f"Q{i}: Answer generation successful")
131
+ answers.append(answer)
132
+ if self.progress_bar:
133
+ progress =(i+1) / len(self.questions)
134
+ self.progress_bar.progress(progress)
135
+ self.percentage_text.text(f"Progress: {int(progress * 100)}%")
136
+ return answers
137
+
138
+
139
# Function to Convert Q&A to CSV
def create_csv(questions, answers, topic):
    """Write parallel question/answer lists to ``csv/Synthetic_Dataset_<topic>.csv``.

    Returns ``(file_path, dataframe)``. The topic is sanitized (stripped,
    path separators replaced) so user input cannot produce an invalid path
    or escape the csv/ directory; callers still work because they receive
    the returned path.
    """
    os.makedirs('csv', exist_ok=True)  # Efficient folder creation

    # Sanitize the topic for use inside a filename.
    safe_topic = topic.strip().replace(os.sep, '_').replace('/', '_')

    # Create DataFrame and Save as CSV
    df = pd.DataFrame({'Question': questions, 'Answer': answers})
    file_path = f"csv/Synthetic_Dataset_{safe_topic}.csv"
    df.to_csv(file_path, index=False)

    print(df.head())
    return file_path, df
Synthetic_data_generation-main/csv/Synthetic_Dataset_Machine Learning .csv ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ Question,Answer
2
+ What are the intended learning outcomes for session 1-2?,To understand the intended learning outcomes for session 1-2.
3
+ What is the main topic of the course being introduced in this document?,machine learning
4
+ "Which authors wrote the book ""IntroducBon to machine learning"" published by Cambridge University, UK.?",Aurélien G. and Christopher M. Bishop
5
+ "In which year was the edition of ""IntroducBon to machine learning"" published by MIT Press?",2020
6
+ What are the intended learning outcomes for session 3-4?,"To understand what are the intended learning outcomes for session 3-4, we need to look at the table of contents. For sessions 3 and 4, the topics include ""Paradigms of Learning"", ""Linear Regression, KNN"", ""Logistic Regression"", ""Decision Tree Classifiers"", ""Clustering"", and so on. The intended learning outcomes for these sessions are listed as: * CLO11: Understand the fundamental concepts of machine learning. * CLO12: Apply machine learning methods to solve complex problems. * CLO13: Analyze and evaluate the performance of machine learning models. * CLO21: Design and implement a machine learning model pipeline architecture. * CLO22: Evaluate the generalizability of a machine learning model. * CLO23: Understand the concept of VC dimension and its applications in machine learning. These outcomes are based on the course learning objectives (CLOs) listed in the provided documents."
7
+ What is the primary focus of the course being introduced in this document?,The primary focus of the course being introduced is machine learning.
8
+ Which topics are covered in sessions 5-6 and 7-8 respectively?,"Sessions 5-6 cover Linear Regression, KNN and Sessions 7-8 cover Overfitting."
9
+ What is the main theme of the course being introduced in this document?,The main theme of the course is machine learning.
Synthetic_data_generation-main/csv/Synthetic_Dataset_Robust Transformer-based TTS.csv ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ Question,Answer
2
+ "What is the main problem that current neural TTS models suffer from, as mentioned in the document?",robustness issue
3
+ How does RobuTrans model the holistic information of the input differently than TransformerTTS?,"RobuTrans models the holistic information of the input differently than TransformerTTS by replacing encoder-decoder attention with a duration-based hard attention mechanism and causal self-attention with a ""pseudo non-causal attention"" mechanism. Additionally, RobuTrans replaces position embedding with a 1-D CNN to model relative position information in a fixed window."
4
+ What role do prosodic features play in RobuTrans for synthesizing expressive speech?,The prosodic feature plays a critical role in RobuTrans for synthesizing expressive speech.
5
+ How does RobuTrans achieve parity MOS with other models like TransformerTTS and Tacotron2?,"RobuTrans achieves parity MOS with other models like TransformerTTS and Tacotron2 by making several modifications to the traditional transformer architecture, including converting input texts to linguistic features, using a duration-based hard attention mechanism in the decoder, replacing causal self-attention with a ""pseudo non-causal attention"" mechanism, and removing position embedding."
6
+ "What is the alternative to position embedding used in RobuTrans, as mentioned in Section 3.4?",A 1-D CNN
Synthetic_data_generation-main/csv/Synthetic_Dataset_Unconstrained Encoder-decoder Attention.csv ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ Question,Answer
2
+ What is the primary issue that the proposed method addresses in the context of text-to-speech (TTS) models?,"The primary issue that the proposed method addresses in the context of text-to-speech (TTS) models is the removal of encoder-decoder attention mechanisms to improve stability, while also introducing a pseudo non-causal attention mechanism to replace causal self-attention layers."
3
+ "The encoder-decoder attention mechanism is a crucial factor for instability, but removing it will also discard its advantages.","The encoder-decoder attention mechanism provides a holistic view of the input sequence for the decoder and composes frame-level context vectors according to decoder inputs. Removing it will discard its advantages but also prevents instability that can arise from certain cases, such as abnormal speech."
4
+ How does the proposed pseudo non-causal attention (PNCA) replace the causal self-attention layers in the decoder?,"The proposed pseudo non-causal attention (PNCA) replaces the causal self-attention layers in the decoder by using a combination of mel frames and encoder hidden states. The left part of PNCA is similar to causal self-attention, which takes encoder hidden states fused with padded mel spectrum frames as input, while the right part consumes only the encoder hidden states."
5
+ "PNCA takes the encoder hidden states fused with padded mel spectrum frames by a linear projection as input, and consumes only the encoder hidden states.",The encoder hidden states fused with padded mel spectrum frames by a linear projection is used as input for PNCA's causal self-attention.
6
+ What is the purpose of using duration predictor in the TTS model?,The purpose of using a duration predictor in the TTS model is to generate correct content by forcing the decoder to copy encoder hidden states to their corresponding frames based on predicted logarithmic duration.
7
+ To predict the logarithmic duration of each encoder hidden state and generate ground truth duration for the model training.,"To predict the logarithmic duration of each encoder hidden state and generate ground truth duration for model training, a duration predictor structure is used with two convolutional layers, normalization, dropout, and a linear projection. Mean squared error is employed as the loss function. Speech recognition tools are used to make forced alignment between audio and phoneme sequence, then predicted duration is used to copy and expand phoneme-level features to frame-level features accordingly."
8
+ How does the phoneme-level features are copied and expanded to frame-level features according to the predicted duration?,"The phoneme-level features are copied and expanded to frame-level features according to the predicted duration by first computing the logarithmic duration of each encoder hidden state using a duration predictor consisting of two convolutional layers and a linear projection. Then, the predicted duration is used to tile the encoded states into time steps. The tiled encoded states are concatenated with the mel spectrum frames processed by Decoder Pre-net, and then through a linear projection. This results in a frame-level context vector that can be used for pseudo non-causal attention."
9
+ The phoneme-level features are copied and expanded to frame-level features by using the predicted duration.,The predicted duration is used to copy and expand phoneme-level features to frame-level features.
10
+ What is the main advantage of PNCA in comparison to causal self-attention layers?,"The main advantage of PNCA over causal self-attention layers is that it provides a holistic view of the input sequence for the decoder while also composing frame-level context vectors according to decoder inputs, making the decoding procedure robust."
Synthetic_data_generation-main/csv/Synthetic_Dataset_encoder-decoder.csv ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ Question,Answer
2
+ What are the two aspects that the encoder-decoder attention provides?,holistic view and context vectors
3
+ What is the name of the proposed mechanism to replace causal self-attention layers?,PNCA
4
+ What feature does the pseudo non-causal attention inherit from the original attention mechanism?,Holistic view
5
+ What differs RobuTrans from TransformerTTS in terms of input for Encoder?,Linguistic
6
+ What layer in Decoder is replaced with pseudo non-causal attention in RobuTrans?,self-attention
7
+ What type of features are obtained using a rule-based system in Text-to-Linguistic-Feature Converter?,Features
Synthetic_data_generation-main/faiss_index/index.faiss ADDED
Binary file (40 kB). View file
 
Synthetic_data_generation-main/faiss_index/index.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f1ff93424ab40a8ed54fb21aed868c68ab32444ae2976670c2f47b9fdefe7404
3
+ size 9468
Synthetic_data_generation-main/frontend.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import PyPDF2
3
+ import pandas as pd
4
+ import os
5
+
6
# Function to extract text from the uploaded PDF
def extract_pdf_content(file):
    """Concatenate the extracted text of every page in the uploaded PDF."""
    reader = PyPDF2.PdfReader(file)
    content = ""
    for page_num in range(len(reader.pages)):
        # Fix: extract_text() may yield None for image-only pages; coalesce
        # to "" instead of raising TypeError on concatenation.
        content += reader.pages[page_num].extract_text() or ""
    return content
12
+ return content
13
+
14
# Function to generate questions and answers using dummy data
def generate_qa(content, topic, num_questions, answer_type, custom_conditions):
    """Produce placeholder question/answer lists (stand-in for the real backend)."""
    indices = range(1, num_questions + 1)
    questions = [f"Sample question {n} about {topic}" for n in indices]
    answers = [f"Sample {answer_type.lower()} answer for question {n}" for n in indices]
    return questions, answers
22
+
23
# Function to convert questions and answers into a CSV format and save to server
def save_to_csv(questions, answers):
    """Persist the Q&A pairs as saved_files/questions_answers.csv and return the path."""
    directory = "saved_files"
    os.makedirs(directory, exist_ok=True)  # Create directory if it doesn't exist
    frame = pd.DataFrame({'Questions': questions, 'Answers': answers})
    file_path = os.path.join(directory, 'questions_answers.csv')  # Full path to save
    frame.to_csv(file_path, index=False)
    return file_path
32
+
33
# Streamlit app structure
def app():
    """Render the Synthetic Data Generator UI and run the generate workflow."""
    # Logo and Title beside each other with slight upward adjustment
    col1, col2 = st.columns([1, 3])  # Adjust column width ratio

    with col1:
        # Fix: use the logo bundled with the repository (also copied into the
        # Docker image) instead of a hard-coded developer-machine path.
        st.image("logo.jpeg", width=150)

    with col2:
        st.markdown(
            "<h1 style='text-align: left; margin-top: -10px;'>Synthetic Data Generator</h1>",
            unsafe_allow_html=True
        )

    # Slogan below the title
    st.markdown("<h2 style='text-align: center;'>Your Reliable Synthetic Dataset Generation</h2>", unsafe_allow_html=True)

    # File Upload for PDF
    file = st.file_uploader("Drag your Content or Document (.pdf only)", type=['pdf'])

    # Topic input
    topic = st.text_input("Topic Name", placeholder="Enter the topic name")

    # Number of questions input
    num_questions = st.number_input("Number of Questions", min_value=1, max_value=100, value=5, step=1)

    # Answer type selection (horizontal)
    answer_type = st.radio("Answer Type", options=["One-word", "Short", "Long"], index=1, horizontal=True)

    # Custom conditions input
    custom_conditions = st.text_area("Custom Conditions", placeholder="Enter any custom rules for the LLM...")

    # Generate button
    generate_button = st.button("Generate")

    if generate_button and file and topic:
        # Extract content from the uploaded PDF
        content = extract_pdf_content(file)

        # Generate questions and answers
        questions, answers = generate_qa(content, topic, num_questions, answer_type, custom_conditions)

        # Display the generated questions and answers
        st.subheader("Generated Questions and Answers")
        for i, (q, a) in enumerate(zip(questions, answers), start=1):
            st.write(f"*Q{i}:* {q}")
            st.write(f"*A{i}:* {a}")
            st.write("---")

        # Save to CSV and provide a download link
        csv_file_path = save_to_csv(questions, answers)

        # Provide a message to indicate where the file is saved
        st.success(f"The CSV file has been saved to the server at: {csv_file_path}")

        # Provide a download link for the CSV file
        with open(csv_file_path, 'rb') as f:
            st.download_button(
                label="Download as CSV",
                data=f,
                file_name="questions_answers.csv",
                mime="text/csv"
            )

        # Add a message below the download button
        st.write("Click the button above to download your CSV file.")
99
+
100
# Run the Streamlit app
# Fix: the original guard used single underscores (_name_ / "_main_"),
# which raises NameError at runtime; the dunder form is required.
if __name__ == "__main__":
    app()
Synthetic_data_generation-main/log/Retrieval_log.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ Clarified Query: SELECT * FROM "ComputerScience" WHERE topic = 'Machine Learning'
2
+ Document 1: Session-Wise Topics and Reading/References CodeTextbook/Ar+cle/Report NameEdi+on/Year/LinkCLO Mapped1Alpaydin, E. IntroducBon to machine learning. MIT Press.2020CLO1, CLO2, & CLO32Smola, A., & Vishwanathan, S. V. N. IntroducBon to machine learning. Cambridge University, UK.2008CLO1, CLO2, & CLO3Sn.TopicSession Intended Learning OutcomeCLOReading Material Code1-2IntroducBon3, 3, 3, 2, 2, 1CLO11, 23-4Paradigms of Learning3, 3, 3, 2, 2, 1CLO11, 25-6Linear Regression, KNN3, 3, 3, 2, 2, 1CLO11, 27-8LogisBc Regression3, 3, 3, 2, 2, 1CLO11, 29-10Decision Tree Classifiers3, 3, 3, 2, 2, 1CLO11, 211-12Clustering3, 3, 3, 2, 2, 1CLO11, 213-14Gaussian Mixture Model3, 3, 3, 2, 2, 1CLO11, 215-16Training & TesBng2, 3, 3, 3, 2CLO21, 217-19GeneralizaBon2, 3, 3, 3, 2CLO21, 220-22VC Dimension2, 3, 3, 3, 2CLO21, 223-24Bias-Variance2, 3, 3, 3, 2CLO21, 225-26Neural Networks2, 3, 3, 3, 2CLO21, 227-28OverfiXng2, 3, 3, 3, 2CLO21, 229-30RegularizaBon 2, 3, 3, 3, 2CLO21, 231-32ValidaBon2, 3, 3, 3, 2CLO21,
3
+ Document 2: 3, 2CLO21, 223-24Bias-Variance2, 3, 3, 3, 2CLO21, 225-26Neural Networks2, 3, 3, 3, 2CLO21, 227-28OverfiXng2, 3, 3, 3, 2CLO21, 229-30RegularizaBon 2, 3, 3, 3, 2CLO21, 231-32ValidaBon2, 3, 3, 3, 2CLO21, 233-34Support Vector Machines2, 3, 3, 3, 2CLO21, 235-36Kernel Methods2, 3, 3, 3, 2CLO21, 237-39IntroducBon to Deep Learning2, 3, 3, 3, 1CLO31, 240-41IntroducBon to Reinforcement Learning2, 3, 3, 3, 1CLO31, 2 4
4
+ Document 3: Prepared by Faculty NameDr. Sai Charan AddankiEmail IDsaicharan.addanki@woxsen.edu.in Brief Descrip+on and Relevance of the Course This course introduces students to the exciBng field of machine learning, providing a comprehensive overview of key concepts, models, and pracBcal applicaBons. Designed for undergraduate students in their fiHh semester, this course aims to equip learners with the foundaBonal knowledge and skills necessary to understand, implement, and criBcally evaluate machine learning models. Programme LOs Addressed PLODefini+on1Engineering knowledge: Apply the knowledge of mathemaBcs, science, engineering fundamentals, and an engineering specializaBon to the soluBon of complex engineering problems2Problem analysis: IdenBfy, formulate, review research literature, and analyze complex engineering problems related to Computer Science and Engineering and reaching substanBated conclusions using first principles of mathemaBcs, natural sciences, and engineering
5
+ Document 4: change 2 Course LOs Course LOMapping to Programme ILOCLO 1: Understanding of ML methods and their appropriate applicaBon3, 3, 3, 2, 2, 1CLO 2: Knowledge of ML model pipeline architecture2, 3, 3, 3, 2CLO 3: ApplicaBons of ML in advanced topics2, 3, 3, 3, 1
Synthetic_data_generation-main/logo.jpeg ADDED
Synthetic_data_generation-main/main.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Streamlit front-end for the Synthetic Data Generator.

Pipeline: upload PDFs -> store chunks in the vector store -> retrieve
topic-relevant context -> generate questions -> generate answers ->
preview the Q/A pairs and offer them as a CSV download.
"""
import streamlit as st
from backend import Database, Context, QuestionGeneration, AnswerGeneration, create_csv

# --- Header: logo on the left, title on the right ---
col1, col2 = st.columns([1, 3])
with col1:
    st.image("logo.jpeg", width=150)
with col2:
    st.markdown(
        "<h1 style='text-align: left; margin-top: -10px;'>Synthetic Data Generator</h1>",
        unsafe_allow_html=True
    )
st.markdown("<h2 style='text-align: center;'>Your Reliable Synthetic Dataset Generation</h2>", unsafe_allow_html=True)

# --- Input widgets ---
file = st.file_uploader("Choose PDF Files", accept_multiple_files=True, type="pdf")
topic = st.text_input("Topic Name", placeholder="Enter the topic name")
num_questions = st.number_input("Number of Questions", min_value=5, max_value=100, value=8)
answer_type = st.radio("Answer Type", options=["One-word", "Short", "Long"], index=1, horizontal=True)
custom_conditions = st.text_area("Custom Conditions", placeholder="Enter any custom rules for the LLM...")

# --- Generation pipeline: runs only when the button is pressed AND both a
# --- file and a topic were provided (silent no-op otherwise).
if st.button("Generate") and file and topic:
    # Chunk the uploaded PDFs and persist them in the vector store.
    with st.spinner("Storing in Database..."):
        db = Database(file)
        db.store()

    # Rewrite the raw topic into a clarified query, then fetch matching
    # context chunks from the FAISS index.
    # Fixed typo in user-facing spinner text: "Retriving" -> "Retrieving".
    with st.spinner("Retrieving Contexts..."):
        context_obj = Context(topic)
        clarified_query = context_obj.redefine()
        context_content = context_obj.retrieve_faiss(clarified_query)

    with st.spinner("Generating Questions..."):
        question_gen = QuestionGeneration(context=context_content, num_questions=num_questions,
                                          question_type=answer_type, conditions=custom_conditions)
        total_questions, questions = question_gen.generate()

    # Report how many questions were actually produced.
    st.write(f"Total {total_questions} Questions Generated")

    # Fixed: dropped the pointless f-prefix on a placeholder-free string.
    with st.spinner("Generating Answer"):
        # CSS to enlarge and recolor the Streamlit progress bar.
        st.markdown("""
            <style>
            .stProgress > div > div > div > div {
                background-color: red; /* Change the color to red */
                height: 25px; /* Increase this value to make it larger */
            }
            </style>
            """, unsafe_allow_html=True)

        progress_bar = st.progress(0)
        percentage_text = st.empty()

        # Generate one answer per question, reporting progress through the
        # progress-bar and percentage widgets created above.
        answer_gen = AnswerGeneration(context_content, questions, answer_type, custom_conditions,
                                      percentage_text=percentage_text, progress_bar=progress_bar)
        answers = answer_gen.generate()

    st.success("Answers Generated")

    # Persist the Q/A pairs to CSV and show an in-page preview.
    csv_file_path, df = create_csv(questions, answers, topic)
    st.write("Preview of Data")
    st.dataframe(df)

    # Offer the CSV for download.
    # Fixed: renamed the file handle so it no longer shadows the `file`
    # variable holding the uploaded PDFs.
    with open(csv_file_path, 'rb') as csv_fh:
        st.write("Click the below button to download your CSV file.")
        st.download_button(label="Download as CSV", data=csv_fh,
                           file_name=f"{topic}_questions_answers.csv", mime="text/csv")
Synthetic_data_generation-main/requirements.txt ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiohappyeyeballs==2.4.0
2
+ aiohttp==3.10.6
3
+ aiosignal==1.3.1
4
+ altair==5.4.1
5
+ annotated-types==0.7.0
6
+ anyio==4.6.0
7
+ async-timeout==4.0.3
8
+ attrs==24.2.0
9
+ beautifulsoup4==4.12.3
10
+ blinker==1.8.2
11
+ cachetools==5.5.0
12
+ certifi==2024.8.30
13
+ charset-normalizer==3.3.2
14
+ click==8.1.7
15
+ dataclasses-json==0.6.7
16
+ exceptiongroup==1.2.2
17
+ faiss-cpu==1.9.0
18
+ frozenlist==1.4.1
19
+ gitdb==4.0.11
20
+ GitPython==3.1.43
21
+ google==3.0.0
22
+ google-ai-generativelanguage==0.6.6
23
+ google-api-core==2.20.0
24
+ google-api-python-client==2.147.0
25
+ google-auth==2.35.0
26
+ google-auth-httplib2==0.2.0
27
+ google-generativeai==0.7.2
28
+ googleapis-common-protos==1.65.0
29
+ grpcio==1.66.1
30
+ grpcio-status==1.62.3
31
+ h11==0.14.0
32
+ httpcore==1.0.5
33
+ httplib2==0.22.0
34
+ httpx==0.27.2
35
+ idna==3.10
36
+ Jinja2==3.1.4
37
+ jsonpatch==1.33
38
+ jsonpointer==3.0.0
39
+ jsonschema==4.23.0
40
+ jsonschema-specifications==2024.10.1
41
+ langchain==0.3.0
42
+ langchain-community==0.3.0
43
+ langchain-core==0.3.5
44
+ langchain-google-genai==2.0.0
45
+ langchain-text-splitters==0.3.0
46
+ langsmith==0.1.128
47
+ markdown-it-py==3.0.0
48
+ MarkupSafe==3.0.2
49
+ marshmallow==3.22.0
50
+ mdurl==0.1.2
51
+ multidict==6.1.0
52
+ mypy-extensions==1.0.0
53
+ narwhals==1.13.1
54
+ numpy==1.26.4
55
+ ollama==0.3.3
56
+ orjson==3.10.7
57
+ packaging==24.1
58
+ pandas==2.2.3
59
+ pillow==10.4.0
60
+ proto-plus==1.24.0
61
+ protobuf==4.25.5
62
+ pyarrow==18.0.0
63
+ pyasn1==0.6.1
64
+ pyasn1_modules==0.4.1
65
+ pydantic==2.9.2
66
+ pydantic-settings==2.5.2
67
+ pydantic_core==2.23.4
68
+ pydeck==0.9.1
69
+ Pygments==2.18.0
70
+ pyparsing==3.1.4
71
+ PyPDF2==3.0.1
72
+ python-dateutil==2.9.0.post0
73
+ python-dotenv==1.0.1
74
+ pytz==2024.2
75
+ PyYAML==6.0.2
76
+ referencing==0.35.1
77
+ requests==2.32.3
78
+ rich==13.9.4
79
+ rpds-py==0.20.1
80
+ rsa==4.9
81
+ six==1.16.0
82
+ smmap==5.0.1
83
+ sniffio==1.3.1
84
+ soupsieve==2.6
85
+ SQLAlchemy==2.0.35
86
+ streamlit==1.39.0
87
+ tenacity==8.5.0
88
+ toml==0.10.2
89
+ tornado==6.4.1
90
+ tqdm==4.66.5
91
+ typing-inspect==0.9.0
92
+ typing_extensions==4.12.2
93
+ tzdata==2024.2
94
+ uritemplate==4.1.1
95
+ urllib3==2.2.3
96
+ yarl==1.12.1