techconsptrs commited on
Commit
1802405
·
1 Parent(s): d14c53b

UPDATE: code update

Browse files
app.py CHANGED
@@ -1,314 +1,249 @@
 
1
  from src.pipelines.completePipeline import Pipeline
2
  import gradio as gr
3
  import spaces
4
- import os
5
 
6
- # os.system("apt-get update -y")
7
- # os.system("apt-get upgrade -y")
8
- # os.system("apt install poppler-utils -y")
9
-
10
- chain = None
11
- pipeline = Pipeline()
12
 
13
  @spaces.GPU
14
- def getTextResponse(text: str, inputQuery: str):
 
 
 
 
 
 
 
 
 
 
15
  global chain
16
  if chain is None:
17
- chain = pipeline.plainText(text = text)
18
- else:
19
- pass
20
- response = chain.invoke(
21
- {
22
- "question": inputQuery
23
- }
24
- )
25
  return response
26
 
27
-
28
  @spaces.GPU
29
- def getSearchablePdfResponse(path: str, inputQuery: str):
 
 
 
 
 
 
 
 
 
 
30
  global chain
31
  if chain is None:
32
- chain = pipeline.searchablePdf(path = path)
33
- else:
34
- pass
35
- response = chain.invoke(
36
- {
37
- "question": inputQuery
38
- }
39
- )
40
  return response
41
 
42
  @spaces.GPU
43
- def getScannablePdfResponse(path: str, inputQuery: str):
 
 
 
 
 
 
 
 
 
 
44
  global chain
45
  if chain is None:
46
- chain = pipeline.scannablePdf(path = path)
47
- else:
48
- pass
49
- response = chain.invoke(
50
- {
51
- "question": inputQuery
52
- }
53
- )
54
  return response
55
 
56
- def clearFunction():
 
57
  global chain
58
  chain = None
59
 
 
60
  with gr.Blocks() as textInterface:
61
  with gr.Row():
62
  inputText = gr.Textbox(
63
- label = "Input Text",
64
- placeholder = "Enter you text here"
65
  )
66
  with gr.Row():
67
  question = gr.Textbox(
68
- label = "Question",
69
- placeholder = "Enter your question here"
70
  )
71
  answer = gr.Textbox(
72
- label = "Response",
73
- interactive = False
74
  )
75
  with gr.Row():
76
- submitButton = gr.Button(
77
- value = "Submit",
78
- variant = "primary"
79
- )
80
  clearButton = gr.ClearButton(
81
- components = [inputText, question, answer],
82
- value = "Clear",
83
- variant = "secondary"
84
  )
85
- submitButton.click(
86
- fn = getTextResponse,
87
- inputs = [inputText, question],
88
- outputs = [answer]
89
- )
90
- clearButton.click(
91
- fn = clearFunction
92
- )
93
-
94
 
 
95
  with gr.Blocks() as searchablePdf:
96
  with gr.Row():
97
  inputFile = gr.File(
98
- file_types = [".pdf"],
99
- file_count = "single",
100
- label = "Select PDF"
101
  )
102
  with gr.Row():
103
- question = gr.Textbox(
104
- label = "Question",
105
- placeholder = "Enter your question here"
106
- )
107
- answer = gr.Textbox(
108
- label = "Response",
109
- interactive = False
110
- )
111
  with gr.Row():
112
- submitButton = gr.Button(
113
- value = "Submit",
114
- variant = "primary"
115
- )
116
  clearButton = gr.ClearButton(
117
- components = [inputFile, question, answer],
118
- value = "Clear",
119
- variant = "secondary"
120
  )
121
- submitButton.click(
122
- fn = getSearchablePdfResponse,
123
- inputs = [inputFile, question],
124
- outputs = [answer]
125
- )
126
- clearButton.click(
127
- fn = clearFunction
128
- )
129
-
130
 
 
131
  with gr.Blocks() as scannablePdf:
132
  with gr.Row():
133
- inputFile = gr.File(
134
- file_types = [".pdf"],
135
- file_count = "single",
136
- label = "Select PDF"
137
- )
138
  with gr.Row():
139
- question = gr.Textbox(
140
- label = "Question",
141
- placeholder = "Enter your question here"
142
- )
143
- answer = gr.Textbox(
144
- label = "Response",
145
- interactive = False
146
- )
147
  with gr.Row():
148
- submitButton = gr.Button(
149
- value = "Submit",
150
- variant = "primary"
151
- )
152
  clearButton = gr.ClearButton(
153
- components = [inputFile, question, answer],
154
- value = "Clear",
155
- variant = "secondary"
156
- )
157
- submitButton.click(
158
- fn = getScannablePdfResponse,
159
- inputs = [inputFile, question],
160
- outputs = [answer]
161
- )
162
- clearButton.click(
163
- fn = clearFunction
164
- )
 
 
 
 
 
 
 
 
 
 
 
165
 
 
 
 
 
166
 
167
- def getLinksButtonFn(baseUrl: str):
168
- links = pipeline.webCrawler.getLinks(url = baseUrl)
169
- checkboxes = gr.CheckboxGroup(
170
- choices = links,
171
- label = "Fetched Links",
172
- visible = True
173
- )
174
- row2 = gr.Row(visible = True)
175
- row3 = gr.Row(visible = True)
176
- return (
177
- checkboxes,
178
- row2,
179
- row3
180
- )
181
 
182
- @spaces.GPU
183
- def getWebsiteResponse(links: list[str], inputQuery: str):
 
184
  global chain
185
  if chain is None:
186
- print(links)
187
- chain = pipeline.webCrawl(urls = links)
188
- else:
189
- pass
190
- response = chain.invoke(
191
- {
192
- "question": inputQuery
193
- }
194
- )
195
  return response
196
 
197
- def clearWebsiteResponse():
 
198
  global chain
199
- chain = None
200
- checkboxes = gr.CheckboxGroup(
201
- choices = [],
202
- label = "Fetched Links",
203
- visible = False
204
- )
205
  return checkboxes
206
 
 
207
  with gr.Blocks() as websiteCrawler:
208
  with gr.Row():
209
  inputUrl = gr.Textbox(
210
- label = "Base URL",
211
- placeholder = "Enter the Base URL to fetch other links",
212
- scale = 3
213
- )
214
- getLinksButton = gr.Button(
215
- value = "Get Links",
216
- variant = "primary",
217
- scale = 1
218
- )
219
- checkboxes = gr.CheckboxGroup(
220
- choices = [],
221
- label = "Fetched Links",
222
- )
223
- with gr.Row(visible = False) as row2:
224
- question = gr.Textbox(
225
- label = "Question",
226
- placeholder = "Enter your question here"
227
- )
228
- answer = gr.Textbox(
229
- label = "Response",
230
- interactive = False
231
- )
232
- with gr.Row(visible = False) as row3:
233
- submitButton = gr.Button(
234
- value = "Submit",
235
- variant = "primary"
236
- )
237
  clearButton = gr.ClearButton(
238
- components = [question, answer],
239
- value = "Clear",
240
- variant = "secondary"
241
  )
242
- getLinksButton.click(
243
- fn = getLinksButtonFn,
244
- inputs = [inputUrl],
245
- outputs = [checkboxes, row2, row3]
246
- )
247
- submitButton.click(
248
- fn = getWebsiteResponse,
249
- inputs = [checkboxes, question],
250
- outputs = [answer]
251
- )
252
- clearButton.click(
253
- fn = clearWebsiteResponse,
254
- inputs = None,
255
- outputs = [checkboxes]
256
- )
257
 
258
  @spaces.GPU
259
- def getYoutubeResponse(links: str, inputQuery: str):
 
 
 
 
 
 
 
 
 
 
260
  global chain
261
- links = [link.strip() for link in links.split(",")]
262
  if chain is None:
263
- chain = pipeline.youtubeLinks(urls = links)
264
- else:
265
- pass
266
- response = chain.invoke(
267
- {
268
- "question": inputQuery
269
- }
270
- )
271
  return response
272
 
273
-
274
  with gr.Blocks() as youtubeInterface:
275
  with gr.Row():
276
  inputLinks = gr.Textbox(
277
- label = "Youtube Links",
278
- placeholder = 'Enter comma(,)-separated youtube video links'
279
  )
280
  with gr.Row():
281
- question = gr.Textbox(
282
- label = "Question",
283
- placeholder = "Enter your question here"
284
- )
285
- answer = gr.Textbox(
286
- label = "Response",
287
- interactive = False
288
- )
289
  with gr.Row():
290
- submitButton = gr.Button(
291
- value = "Submit",
292
- variant = "primary"
293
- )
294
  clearButton = gr.ClearButton(
295
- components = [inputLinks, question, answer],
296
- value = "Clear",
297
- variant = "secondary"
298
  )
299
- submitButton.click(
300
- fn = getYoutubeResponse,
301
- inputs = [inputLinks, question],
302
- outputs = [answer]
303
- )
304
- clearButton.click(
305
- fn = clearFunction
306
- )
307
-
308
 
 
309
  application = gr.TabbedInterface(
310
  [textInterface, searchablePdf, scannablePdf, websiteCrawler, youtubeInterface],
311
  ["Text", "Searchable PDF", "Scannable PDF", "Website Text", "Youtube Transcripts"]
312
  )
313
 
 
314
  application.launch()
 
1
+ # Import necessary libraries and modules
2
  from src.pipelines.completePipeline import Pipeline
3
  import gradio as gr
4
  import spaces
 
5
 
6
+ # Initialize global variables
7
+ chain = None # Holds the current processing chain
8
+ pipeline = Pipeline() # Instantiate the processing pipeline
 
 
 
9
 
10
  @spaces.GPU
11
+ def getTextResponse(text: str, inputQuery: str) -> str:
12
+ """
13
+ Generate a response based on the input text and query.
14
+
15
+ Args:
16
+ text (str): The input text to process.
17
+ inputQuery (str): The question to be answered.
18
+
19
+ Returns:
20
+ str: The response generated from the input text.
21
+ """
22
  global chain
23
  if chain is None:
24
+ chain = pipeline.plainText(text=text) # Create a new processing chain for plain text
25
+ response = chain.invoke({"question": inputQuery}) # Process the query
 
 
 
 
 
 
26
  return response
27
 
 
28
  @spaces.GPU
29
+ def getSearchablePdfResponse(path: str, inputQuery: str) -> str:
30
+ """
31
+ Generate a response based on a searchable PDF and query.
32
+
33
+ Args:
34
+ path (str): Path to the searchable PDF.
35
+ inputQuery (str): The question to be answered.
36
+
37
+ Returns:
38
+ str: The response generated from the searchable PDF.
39
+ """
40
  global chain
41
  if chain is None:
42
+ chain = pipeline.searchablePdf(path=path) # Create a new processing chain for the PDF
43
+ response = chain.invoke({"question": inputQuery})
 
 
 
 
 
 
44
  return response
45
 
46
  @spaces.GPU
47
+ def getScannablePdfResponse(path: str, inputQuery: str) -> str:
48
+ """
49
+ Generate a response based on a scannable PDF and query.
50
+
51
+ Args:
52
+ path (str): Path to the scannable PDF.
53
+ inputQuery (str): The question to be answered.
54
+
55
+ Returns:
56
+ str: The response generated from the scannable PDF.
57
+ """
58
  global chain
59
  if chain is None:
60
+ chain = pipeline.scannablePdf(path=path) # Create a new processing chain for the scannable PDF
61
+ response = chain.invoke({"question": inputQuery})
 
 
 
 
 
 
62
  return response
63
 
64
+ def clearFunction() -> None:
65
+ """Reset the processing chain to prepare for new queries."""
66
  global chain
67
  chain = None
68
 
69
+ # User interface for text input
70
  with gr.Blocks() as textInterface:
71
  with gr.Row():
72
  inputText = gr.Textbox(
73
+ label="Input Text",
74
+ placeholder="Enter your text here"
75
  )
76
  with gr.Row():
77
  question = gr.Textbox(
78
+ label="Question",
79
+ placeholder="Enter your question here"
80
  )
81
  answer = gr.Textbox(
82
+ label="Response",
83
+ interactive=False # Make the response field read-only
84
  )
85
  with gr.Row():
86
+ submitButton = gr.Button(value="Submit", variant="primary")
 
 
 
87
  clearButton = gr.ClearButton(
88
+ components=[inputText, question, answer],
89
+ value="Clear",
90
+ variant="secondary"
91
  )
92
+ # Define actions for buttons
93
+ submitButton.click(fn=getTextResponse, inputs=[inputText, question], outputs=[answer])
94
+ clearButton.click(fn=clearFunction)
 
 
 
 
 
 
95
 
96
+ # User interface for searchable PDF input
97
  with gr.Blocks() as searchablePdf:
98
  with gr.Row():
99
  inputFile = gr.File(
100
+ file_types=[".pdf"], # Restrict file types to PDFs
101
+ file_count="single", # Allow only one PDF file selection
102
+ label="Select PDF"
103
  )
104
  with gr.Row():
105
+ question = gr.Textbox(label="Question", placeholder="Enter your question here")
106
+ answer = gr.Textbox(label="Response", interactive=False)
 
 
 
 
 
 
107
  with gr.Row():
108
+ submitButton = gr.Button(value="Submit", variant="primary")
 
 
 
109
  clearButton = gr.ClearButton(
110
+ components=[inputFile, question, answer],
111
+ value="Clear",
112
+ variant="secondary"
113
  )
114
+ # Define actions for buttons
115
+ submitButton.click(fn=getSearchablePdfResponse, inputs=[inputFile, question], outputs=[answer])
116
+ clearButton.click(fn=clearFunction)
 
 
 
 
 
 
117
 
118
+ # User interface for scannable PDF input
119
  with gr.Blocks() as scannablePdf:
120
  with gr.Row():
121
+ inputFile = gr.File(file_types=[".pdf"], file_count="single", label="Select PDF")
 
 
 
 
122
  with gr.Row():
123
+ question = gr.Textbox(label="Question", placeholder="Enter your question here")
124
+ answer = gr.Textbox(label="Response", interactive=False)
 
 
 
 
 
 
125
  with gr.Row():
126
+ submitButton = gr.Button(value="Submit", variant="primary")
 
 
 
127
  clearButton = gr.ClearButton(
128
+ components=[inputFile, question, answer],
129
+ value="Clear",
130
+ variant="secondary"
131
+ )
132
+ # Define actions for buttons
133
+ submitButton.click(fn=getScannablePdfResponse, inputs=[inputFile, question], outputs=[answer])
134
+ clearButton.click(fn=clearFunction)
135
+
136
+ def getLinksButtonFn(baseUrl: str) -> tuple:
137
+ """
138
+ Fetch links from the specified base URL.
139
+
140
+ Args:
141
+ baseUrl (str): The base URL from which to fetch links.
142
+
143
+ Returns:
144
+ tuple: A tuple containing a CheckboxGroup of fetched links and two rows for the UI.
145
+ """
146
+ links = pipeline.webCrawler.getLinks(url=baseUrl) # Fetch links using the web crawler
147
+ checkboxes = gr.CheckboxGroup(choices=links, label="Fetched Links", visible=True)
148
+ row2 = gr.Row(visible=True)
149
+ row3 = gr.Row(visible=True)
150
+ return checkboxes, row2, row3
151
 
152
+ @spaces.GPU
153
+ def getWebsiteResponse(links: list[str], inputQuery: str) -> str:
154
+ """
155
+ Generate a response based on fetched website links and a query.
156
 
157
+ Args:
158
+ links (list[str]): List of links to process.
159
+ inputQuery (str): The question to be answered.
 
 
 
 
 
 
 
 
 
 
 
160
 
161
+ Returns:
162
+ str: The response generated from the website links.
163
+ """
164
  global chain
165
  if chain is None:
166
+ chain = pipeline.webCrawl(urls=links) # Create a new processing chain for web crawling
167
+ response = chain.invoke({"question": inputQuery})
 
 
 
 
 
 
 
168
  return response
169
 
170
+ def clearWebsiteResponse() -> gr.CheckboxGroup:
171
+ """Clear the website response and reset the checkboxes."""
172
  global chain
173
+ chain = None # Reset the chain
174
+ checkboxes = gr.CheckboxGroup(choices=[], label="Fetched Links", visible=False)
 
 
 
 
175
  return checkboxes
176
 
177
+ # User interface for website crawling
178
  with gr.Blocks() as websiteCrawler:
179
  with gr.Row():
180
  inputUrl = gr.Textbox(
181
+ label="Base URL",
182
+ placeholder="Enter the Base URL to fetch other links",
183
+ scale=3
184
+ )
185
+ getLinksButton = gr.Button(value="Get Links", variant="primary", scale=1)
186
+ checkboxes = gr.CheckboxGroup(choices=[], label="Fetched Links")
187
+ with gr.Row(visible=False) as row2:
188
+ question = gr.Textbox(label="Question", placeholder="Enter your question here")
189
+ answer = gr.Textbox(label="Response", interactive=False)
190
+ with gr.Row(visible=False) as row3:
191
+ submitButton = gr.Button(value="Submit", variant="primary")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
192
  clearButton = gr.ClearButton(
193
+ components=[question, answer],
194
+ value="Clear",
195
+ variant="secondary"
196
  )
197
+ # Define actions for buttons
198
+ getLinksButton.click(fn=getLinksButtonFn, inputs=[inputUrl], outputs=[checkboxes, row2, row3])
199
+ submitButton.click(fn=getWebsiteResponse, inputs=[checkboxes, question], outputs=[answer])
200
+ clearButton.click(fn=clearWebsiteResponse, inputs=None, outputs=[checkboxes])
 
 
 
 
 
 
 
 
 
 
 
201
 
202
  @spaces.GPU
203
+ def getYoutubeResponse(links: str, inputQuery: str) -> str:
204
+ """
205
+ Generate a response based on YouTube video links and a query.
206
+
207
+ Args:
208
+ links (str): Comma-separated YouTube video links.
209
+ inputQuery (str): The question to be answered.
210
+
211
+ Returns:
212
+ str: The response generated from the YouTube videos.
213
+ """
214
  global chain
215
+ links = [link.strip() for link in links.split(",")] # Split and clean the links
216
  if chain is None:
217
+ chain = pipeline.youtubeLinks(urls=links) # Create a new processing chain for YouTube links
218
+ response = chain.invoke({"question": inputQuery})
 
 
 
 
 
 
219
  return response
220
 
221
+ # User interface for YouTube links
222
  with gr.Blocks() as youtubeInterface:
223
  with gr.Row():
224
  inputLinks = gr.Textbox(
225
+ label="Youtube Links",
226
+ placeholder='Enter comma(,)-separated youtube video links'
227
  )
228
  with gr.Row():
229
+ question = gr.Textbox(label="Question", placeholder="Enter your question here")
230
+ answer = gr.Textbox(label="Response", interactive=False)
 
 
 
 
 
 
231
  with gr.Row():
232
+ submitButton = gr.Button(value="Submit", variant="primary")
 
 
 
233
  clearButton = gr.ClearButton(
234
+ components=[inputLinks, question, answer],
235
+ value="Clear",
236
+ variant="secondary"
237
  )
238
+ # Define actions for buttons
239
+ submitButton.click(fn=getYoutubeResponse, inputs=[inputLinks, question], outputs=[answer])
240
+ clearButton.click(fn=clearFunction)
 
 
 
 
 
 
241
 
242
+ # Create a tabbed interface for the different functionalities
243
  application = gr.TabbedInterface(
244
  [textInterface, searchablePdf, scannablePdf, websiteCrawler, youtubeInterface],
245
  ["Text", "Searchable PDF", "Scannable PDF", "Website Text", "Youtube Transcripts"]
246
  )
247
 
248
+ # Launch the Gradio application
249
  application.launch()
requirements.txt CHANGED
@@ -1,151 +1,17 @@
1
- aiofiles==23.2.1
2
- aiohappyeyeballs==2.4.0
3
- aiohttp==3.10.6
4
- aiosignal==1.3.1
5
- annotated-types==0.7.0
6
- anyio==4.6.0
7
- asttokens==2.4.1
8
- async-timeout==4.0.3
9
- attrs==24.2.0
10
- Authlib==1.3.2
11
  beautifulsoup4==4.12.3
12
- certifi==2024.8.30
13
- cffi==1.17.1
14
- charset-normalizer==3.3.2
15
- click==8.0.4
16
- cryptography==43.0.1
17
- dataclasses-json==0.6.7
18
- datasets==3.0.0
19
- decorator==5.1.1
20
- dill==0.3.8
21
- distro==1.9.0
22
  easyocr==1.7.2
23
- exceptiongroup==1.2.2
24
- executing==2.1.0
25
- fastapi==0.115.0
26
- ffmpy==0.4.0
27
- filelock==3.16.1
28
- frozenlist==1.4.1
29
- fsspec==2024.6.1
30
  gradio==5.0.2
31
- gradio_client==1.4.0
32
- greenlet==3.1.1
33
- groq==0.11.0
34
- h11==0.14.0
35
- hf_transfer==0.1.8
36
- httpcore==1.0.6
37
- httpx==0.27.2
38
- huggingface-hub==0.25.1
39
- idna==3.10
40
- imageio==2.35.1
41
- ipython==8.28.0
42
- itsdangerous==2.2.0
43
- jedi==0.19.1
44
- Jinja2==3.1.4
45
- joblib==1.4.2
46
- jsonpatch==1.33
47
- jsonpointer==3.0.0
48
  langchain==0.3.3
49
  langchain-community==0.3.2
50
  langchain-core==0.3.10
51
  langchain-groq==0.2.0
52
  langchain-huggingface==0.1.0
53
  langchain-text-splitters==0.3.0
54
- langsmith==0.1.134
55
- lazy_loader==0.4
56
- markdown-it-py==3.0.0
57
- MarkupSafe==2.1.5
58
- marshmallow==3.22.0
59
- matplotlib-inline==0.1.7
60
- mdurl==0.1.2
61
- mpmath==1.3.0
62
- multidict==6.1.0
63
- multiprocess==0.70.16
64
- mypy-extensions==1.0.0
65
- networkx==3.3
66
- ninja==1.11.1.1
67
  numpy==1.26.4
68
- nvidia-cublas-cu12==12.1.3.1
69
- nvidia-cuda-cupti-cu12==12.1.105
70
- nvidia-cuda-nvrtc-cu12==12.1.105
71
- nvidia-cuda-runtime-cu12==12.1.105
72
- nvidia-cudnn-cu12==9.1.0.70
73
- nvidia-cufft-cu12==11.0.2.54
74
- nvidia-curand-cu12==10.3.2.106
75
- nvidia-cusolver-cu12==11.4.5.107
76
- nvidia-cusparse-cu12==12.1.0.106
77
- nvidia-nccl-cu12==2.20.5
78
- nvidia-nvjitlink-cu12==12.6.68
79
- nvidia-nvtx-cu12==12.1.105
80
- opencv-python-headless==4.10.0.84
81
- orjson==3.10.7
82
- packaging==24.1
83
- pandas==2.2.3
84
- parso==0.8.4
85
  pdf2image==1.17.0
86
- pexpect==4.9.0
87
- pillow==10.4.0
88
- prompt_toolkit==3.0.48
89
- protobuf==3.20.3
90
- psutil==5.9.8
91
- ptyprocess==0.7.0
92
- pure_eval==0.2.3
93
- pyarrow==17.0.0
94
- pyclipper==1.3.0.post5
95
- pycparser==2.22
96
- pydantic==2.9.2
97
- pydantic-settings==2.5.2
98
- pydantic_core==2.23.4
99
- pydub==0.25.1
100
- Pygments==2.18.0
101
  PyMuPDF==1.24.11
102
- python-bidi==0.6.0
103
- python-dateutil==2.9.0.post0
104
  python-dotenv==1.0.1
105
- python-multipart==0.0.12
106
- pytz==2024.2
107
- PyYAML==6.0.2
108
- regex==2024.9.11
109
  requests==2.32.3
110
- requests-toolbelt==1.0.0
111
- rich==13.9.2
112
- ruff==0.6.9
113
- safetensors==0.4.5
114
- scikit-image==0.24.0
115
- scikit-learn==1.5.2
116
- scipy==1.14.1
117
- semantic-version==2.10.0
118
- sentence-transformers==3.2.0
119
- shapely==2.0.6
120
- shellingham==1.5.4
121
- six==1.16.0
122
- sniffio==1.3.1
123
- soupsieve==2.6
124
- spaces==0.30.3
125
- SQLAlchemy==2.0.35
126
- stack-data==0.6.3
127
- starlette==0.38.6
128
- sympy==1.13.3
129
- tenacity==8.5.0
130
- threadpoolctl==3.5.0
131
- tifffile==2024.9.20
132
- tokenizers==0.20.1
133
- tomlkit==0.12.0
134
- torch==2.4.1
135
- torchvision==0.19.1
136
- tqdm==4.66.5
137
- traitlets==5.14.3
138
- transformers==4.45.2
139
- triton==3.0.0
140
- typer==0.12.5
141
- typing-inspect==0.9.0
142
- typing_extensions==4.12.2
143
- tzdata==2024.2
144
  urllib3==2.2.3
145
- uvicorn==0.31.1
146
- wcwidth==0.2.13
147
- websockets==12.0
148
- xxhash==3.5.0
149
- yarl==1.12.1
150
  youtube-transcript-api==0.6.2
151
  -e .
 
 
 
 
 
 
 
 
 
 
 
1
  beautifulsoup4==4.12.3
 
 
 
 
 
 
 
 
 
 
2
  easyocr==1.7.2
 
 
 
 
 
 
 
3
  gradio==5.0.2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  langchain==0.3.3
5
  langchain-community==0.3.2
6
  langchain-core==0.3.10
7
  langchain-groq==0.2.0
8
  langchain-huggingface==0.1.0
9
  langchain-text-splitters==0.3.0
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  numpy==1.26.4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  pdf2image==1.17.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  PyMuPDF==1.24.11
 
 
13
  python-dotenv==1.0.1
 
 
 
 
14
  requests==2.32.3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  urllib3==2.2.3
 
 
 
 
 
16
  youtube-transcript-api==0.6.2
17
  -e .
setup.py CHANGED
@@ -1,6 +1,7 @@
1
  from setuptools import setup, find_packages
2
 
3
  HYPEN_E_DOT = "-e ."
 
4
  def getRequirements(requirementsPath: str) -> list[str]:
5
  with open(requirementsPath) as file:
6
  requirements = file.read().split("\n")
@@ -8,10 +9,11 @@ def getRequirements(requirementsPath: str) -> list[str]:
8
  return requirements
9
 
10
  setup(
11
- name = "ConversAI",
12
- author = "Rauhan Ahmed Siddiqui",
13
- author_email = "rauhaan.siddiqui@gmail.com",
14
- version = "0.1",
15
- packages = find_packages(),
16
- install_requires = getRequirements(requirementsPath = "requirements.txt")
 
17
  )
 
1
  from setuptools import setup, find_packages
2
 
3
  HYPEN_E_DOT = "-e ."
4
+
5
  def getRequirements(requirementsPath: str) -> list[str]:
6
  with open(requirementsPath) as file:
7
  requirements = file.read().split("\n")
 
9
  return requirements
10
 
11
  setup(
12
+ name="ConversAI",
13
+ author="Rauhan Ahmed Siddiqui",
14
+ author_email="rauhaan.siddiqui@gmail.com",
15
+ version="0.1",
16
+ packages=find_packages(),
17
+ install_requires=getRequirements(requirementsPath="requirements.txt"),
18
+ description="ConversAI: An innovative conversational AI framework for intelligent text extraction and querying.",
19
  )
src/components/loaders/pdfLoader.py CHANGED
@@ -7,17 +7,37 @@ import numpy as np
7
  import pymupdf
8
  import easyocr
9
 
10
-
11
  class PdfLoader:
12
  def __init__(self) -> None:
13
- self.config = getConfig(path = "config.ini")
14
- self.reader = easyocr.Reader(['en'], gpu = self.config.getboolean("EASYOCR", "gpu"))
15
-
16
- def extractTextFromPage(self, page):
17
- return cleanText(text = page.get_text())
18
-
19
- def searchablePdf(self, pdfPath: str):
20
- try:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  logger.info("Text Extraction Started from Searchable PDF")
22
  doc = pymupdf.open(pdfPath)
23
  pages = [doc.load_page(i) for i in range(len(doc))]
@@ -27,12 +47,30 @@ class PdfLoader:
27
  return "\n".join(texts)
28
  except Exception as e:
29
  logger.error(CustomException(e))
30
-
31
- def getText(self, image):
 
 
 
 
 
 
 
 
 
32
  text = "\n".join([text[1] for text in self.reader.readtext(np.array(image), paragraph=True)])
33
- return cleanText(text = text)
 
 
 
 
 
 
 
34
 
35
- def scannablePdf(self, pdfPath: str):
 
 
36
  try:
37
  logger.info("Text Extraction Started from Scannable PDF")
38
  allImages = convert_from_path(pdfPath)
 
7
  import pymupdf
8
  import easyocr
9
 
 
10
  class PdfLoader:
11
  def __init__(self) -> None:
12
+ """
13
+ Initialize the PdfLoader with configuration settings and an EasyOCR reader.
14
+ """
15
+ self.config = getConfig(path="config.ini")
16
+ self.reader = easyocr.Reader(['en'], gpu=self.config.getboolean("EASYOCR", "gpu"))
17
+
18
+ def extractTextFromPage(self, page) -> str:
19
+ """
20
+ Extract and clean text from a PDF page.
21
+
22
+ Args:
23
+ page: A PyMuPDF page object.
24
+
25
+ Returns:
26
+ str: Cleaned text extracted from the page.
27
+ """
28
+ return cleanText(text=page.get_text())
29
+
30
+ def searchablePdf(self, pdfPath: str) -> str:
31
+ """
32
+ Extract text from a searchable PDF.
33
+
34
+ Args:
35
+ pdfPath (str): The file path to the searchable PDF.
36
+
37
+ Returns:
38
+ str: All extracted text from the PDF.
39
+ """
40
+ try:
41
  logger.info("Text Extraction Started from Searchable PDF")
42
  doc = pymupdf.open(pdfPath)
43
  pages = [doc.load_page(i) for i in range(len(doc))]
 
47
  return "\n".join(texts)
48
  except Exception as e:
49
  logger.error(CustomException(e))
50
+
51
+ def getText(self, image) -> str:
52
+ """
53
+ Extract and clean text from an image using EasyOCR.
54
+
55
+ Args:
56
+ image: An image (numpy array).
57
+
58
+ Returns:
59
+ str: Cleaned text extracted from the image.
60
+ """
61
  text = "\n".join([text[1] for text in self.reader.readtext(np.array(image), paragraph=True)])
62
+ return cleanText(text=text)
63
+
64
+ def scannablePdf(self, pdfPath: str) -> str:
65
+ """
66
+ Extract text from a scannable PDF using OCR.
67
+
68
+ Args:
69
+ pdfPath (str): The file path to the scannable PDF.
70
 
71
+ Returns:
72
+ str: All extracted text from the PDF.
73
+ """
74
  try:
75
  logger.info("Text Extraction Started from Scannable PDF")
76
  allImages = convert_from_path(pdfPath)
src/components/loaders/websiteCrawler.py CHANGED
@@ -1,65 +1,100 @@
1
  from concurrent.futures import ThreadPoolExecutor
2
  from src.utils.exceptions import CustomException
3
  from urllib.parse import urlparse, urljoin
4
- from src.utils.functions import getConfig
5
- from src.utils.functions import cleanText
6
  from src.utils.logging import logger
7
  from bs4 import BeautifulSoup
8
  import time
9
  import requests
10
 
11
-
12
  class WebsiteCrawler:
13
  def __init__(self):
14
- self.config = getConfig(path = "config.ini")
 
 
 
 
 
15
 
16
- def getLinksFromPage(self, url: str):
 
 
 
 
 
17
  response = requests.get(url)
18
  soup = BeautifulSoup(response.content, "html.parser")
19
  anchors = soup.find_all("a")
20
  links = []
 
21
  for anchor in anchors:
22
  if "href" in anchor.attrs:
23
  if urlparse(anchor.attrs["href"]).netloc == urlparse(url).netloc:
24
  links.append(anchor.attrs["href"])
25
  elif not anchor.attrs["href"].startswith(("//", "file", "javascript", "tel", "mailto", "http")):
26
  links.append(urljoin(url + "/", anchor.attrs["href"]))
27
- else:
28
- pass
29
  links = [link for link in links if "#" not in link]
30
  links = list(set(links))
31
- else:
32
- continue
33
  return links
34
-
35
- def getLinks(self, url: str):
 
 
 
 
 
 
 
 
 
36
  try:
37
- logger.info("fetching links from url")
38
  start = time.time()
39
  links = self.getLinksFromPage(url)
40
  uniqueLinks = set()
 
41
  for link in links:
42
  now = time.time()
43
  if now - start > self.config.getint("WEBCRAWLER", "timeout"):
44
  break
45
- else:
46
- uniqueLinks = uniqueLinks.union(set(self.getLinksFromPage(link)))
47
- return list(set([x[:len(x) - 1] if x[-1] == "/" else x for x in uniqueLinks]))
48
  except Exception as e:
49
  logger.error(CustomException(e))
50
 
51
- def extractTextFromUrl(self, url: str):
 
 
 
 
 
 
 
 
 
52
  response = requests.get(url)
53
  response.raise_for_status()
54
  html = response.text
55
  soup = BeautifulSoup(html, 'html.parser')
56
- return cleanText(text = soup.get_text(separator=' ', strip=True))
 
 
 
 
 
 
 
57
 
58
- def extractTextFromUrlList(self, urls: list[str]):
 
 
59
  try:
60
- logger.info("extracting text from urls")
61
  with ThreadPoolExecutor() as executor:
62
  texts = list(executor.map(self.extractTextFromUrl, urls))
63
- return "\n".join(texts)
64
  except Exception as e:
65
  logger.error(CustomException(e))
 
1
  from concurrent.futures import ThreadPoolExecutor
2
  from src.utils.exceptions import CustomException
3
  from urllib.parse import urlparse, urljoin
4
+ from src.utils.functions import getConfig, cleanText
 
5
  from src.utils.logging import logger
6
  from bs4 import BeautifulSoup
7
  import time
8
  import requests
9
 
 
10
  class WebsiteCrawler:
11
  def __init__(self):
12
+ """Initialize the WebsiteCrawler with configuration settings."""
13
+ self.config = getConfig(path="config.ini")
14
+
15
+ def getLinksFromPage(self, url: str) -> list[str]:
16
+ """
17
+ Extract all valid links from a given webpage.
18
 
19
+ Args:
20
+ url (str): The URL of the webpage to extract links from.
21
+
22
+ Returns:
23
+ list[str]: A list of extracted links from the page.
24
+ """
25
  response = requests.get(url)
26
  soup = BeautifulSoup(response.content, "html.parser")
27
  anchors = soup.find_all("a")
28
  links = []
29
+
30
  for anchor in anchors:
31
  if "href" in anchor.attrs:
32
  if urlparse(anchor.attrs["href"]).netloc == urlparse(url).netloc:
33
  links.append(anchor.attrs["href"])
34
  elif not anchor.attrs["href"].startswith(("//", "file", "javascript", "tel", "mailto", "http")):
35
  links.append(urljoin(url + "/", anchor.attrs["href"]))
36
+
 
37
  links = [link for link in links if "#" not in link]
38
  links = list(set(links))
39
+
 
40
  return links
41
+
42
+ def getLinks(self, url: str) -> list[str]:
43
+ """
44
+ Fetch and return all unique links found from the given URL.
45
+
46
+ Args:
47
+ url (str): The starting URL to fetch links from.
48
+
49
+ Returns:
50
+ list[str]: A list of unique links found.
51
+ """
52
  try:
53
+ logger.info("Fetching links from URL")
54
  start = time.time()
55
  links = self.getLinksFromPage(url)
56
  uniqueLinks = set()
57
+
58
  for link in links:
59
  now = time.time()
60
  if now - start > self.config.getint("WEBCRAWLER", "timeout"):
61
  break
62
+ uniqueLinks = uniqueLinks.union(set(self.getLinksFromPage(link)))
63
+
64
+ return list(set([x[:-1] if x[-1] == "/" else x for x in uniqueLinks]))
65
  except Exception as e:
66
  logger.error(CustomException(e))
67
 
68
+ def extractTextFromUrl(self, url: str) -> str:
69
+ """
70
+ Extract and clean text content from a given URL.
71
+
72
+ Args:
73
+ url (str): The URL of the webpage to extract text from.
74
+
75
+ Returns:
76
+ str: Cleaned text extracted from the webpage.
77
+ """
78
  response = requests.get(url)
79
  response.raise_for_status()
80
  html = response.text
81
  soup = BeautifulSoup(html, 'html.parser')
82
+ return cleanText(text=soup.get_text(separator=' ', strip=True))
83
+
84
+ def extractTextFromUrlList(self, urls: list[str]) -> str:
85
+ """
86
+ Extract text from a list of URLs concurrently.
87
+
88
+ Args:
89
+ urls (list[str]): A list of URLs to extract text from.
90
 
91
+ Returns:
92
+ str: All extracted text combined into a single string.
93
+ """
94
  try:
95
+ logger.info("Extracting text from URLs")
96
  with ThreadPoolExecutor() as executor:
97
  texts = list(executor.map(self.extractTextFromUrl, urls))
98
+ return "\n".join(texts)
99
  except Exception as e:
100
  logger.error(CustomException(e))
src/components/loaders/youtubeLoader.py CHANGED
@@ -3,22 +3,29 @@ from src.utils.exceptions import CustomException
3
  from src.utils.functions import cleanText
4
  from src.utils.logging import logger
5
 
6
-
7
  class YoutubeTranscriptLoader:
8
  def __init__(self):
 
9
  pass
10
 
11
- def getTranscripts(self, urls: str):
 
 
 
 
 
 
 
 
 
12
  texts = []
13
  for url in set(urls):
14
  try:
15
- loader = YoutubeLoader.from_youtube_url(
16
- url, add_video_info=False
17
- )
18
  doc = " ".join([x.page_content for x in loader.load()])
19
- texts.append(cleanText(text = doc))
20
  except Exception as e:
21
  logger.error(CustomException(e))
22
- doc = ""
23
- texts.append(doc)
24
  return "\n".join(texts)
 
3
  from src.utils.functions import cleanText
4
  from src.utils.logging import logger
5
 
 
6
class YoutubeTranscriptLoader:
    def __init__(self):
        """Initialize the YoutubeTranscriptLoader."""
        pass

    def getTranscripts(self, urls: list[str]) -> str:
        """
        Retrieve transcripts from a collection of YouTube URLs.

        Duplicate URLs are fetched only once. A video whose transcript
        cannot be loaded contributes an empty line instead of aborting
        the whole batch.

        Args:
            urls (list[str]): YouTube video URLs to fetch transcripts from.

        Returns:
            str: Cleaned transcripts joined by newlines.
        """
        texts = []
        for url in set(urls):
            try:
                loader = YoutubeLoader.from_youtube_url(url, add_video_info=False)
                doc = " ".join([x.page_content for x in loader.load()])
                texts.append(cleanText(text=doc))
            except Exception as e:
                logger.error(CustomException(e))
                texts.append("")  # Keep one entry per URL even on failure.

        return "\n".join(texts)
src/components/rag/RAG.py CHANGED
@@ -3,39 +3,52 @@ from langchain_core.output_parsers import StrOutputParser
3
  from langchain_core.prompts import ChatPromptTemplate
4
  from langchain_core.runnables import RunnableLambda
5
  from src.utils.exceptions import CustomException
6
- from src.utils.functions import getConfig
7
- from src.utils.functions import loadYaml
8
  from src.utils.logging import logger
9
  from langchain_groq import ChatGroq
10
 
11
-
12
  class Chain:
13
  def __init__(self):
14
- self.config = getConfig(path = "config.ini")
 
15
  self.store = VectorStore()
16
- prompt = loadYaml(path = "params.yaml")["prompt"]
17
  self.prompt = ChatPromptTemplate.from_template(prompt)
18
 
19
- def formatDocs(self, docs):
20
- context = ""
21
- for doc in docs:
22
- context += f"{doc}\n\n\n"
23
- if context == "":
24
- context = "No Context Found"
25
- else:
26
- pass
 
 
 
27
  return context
28
 
29
  def returnChain(self, text: str):
 
 
 
 
 
 
 
 
 
30
  try:
31
- logger.info("preparing chain")
32
- store = self.store.setupStore(text = text)
33
  chain = (
34
- {"context": RunnableLambda(lambda x: x["question"]) | store | RunnableLambda(self.formatDocs),
35
- "question": RunnableLambda(lambda x: x["question"])}
36
- | self.prompt
37
- | ChatGroq(model_name = self.config.get("LLM", "llmModel"), temperature = self.config.getfloat("LLM", "temperature"), max_tokens = self.config.getint("LLM", "maxTokens"))
38
- | StrOutputParser()
 
 
39
  )
40
  return chain
41
  except Exception as e:
 
3
  from langchain_core.prompts import ChatPromptTemplate
4
  from langchain_core.runnables import RunnableLambda
5
  from src.utils.exceptions import CustomException
6
+ from src.utils.functions import getConfig, loadYaml
 
7
  from src.utils.logging import logger
8
  from langchain_groq import ChatGroq
9
 
 
10
  class Chain:
11
def __init__(self):
    """Initialize the Chain with its config, vector store, and prompt template."""
    self.config = getConfig(path="config.ini")
    self.store = VectorStore()
    promptTemplate = loadYaml(path="params.yaml")["prompt"]
    self.prompt = ChatPromptTemplate.from_template(promptTemplate)
17
 
18
def formatDocs(self, docs) -> str:
    """
    Format retrieved documents into a single context string.

    Args:
        docs: An iterable of retrieved items (Document objects or anything
            coercible to str).

    Returns:
        str: Items separated by blank lines, or "No Context Found" when
        nothing was retrieved.
    """
    # str() coercion matters: the retriever yields Document objects, and
    # str.join() raises TypeError on non-string items.
    context = "\n\n\n".join(str(doc) for doc in docs) or "No Context Found"
    return context
30
 
31
  def returnChain(self, text: str):
32
+ """
33
+ Create and return a processing chain based on the input text.
34
+
35
+ Args:
36
+ text (str): Input text to prepare the chain.
37
+
38
+ Returns:
39
+ Chain: Configured chain for processing input.
40
+ """
41
  try:
42
+ logger.info("Preparing chain")
43
+ store = self.store.setupStore(text=text)
44
  chain = (
45
+ {"context": RunnableLambda(lambda x: x["question"]) | store | RunnableLambda(self.formatDocs),
46
+ "question": RunnableLambda(lambda x: x["question"])}
47
+ | self.prompt
48
+ | ChatGroq(model_name=self.config.get("LLM", "llmModel"),
49
+ temperature=self.config.getfloat("LLM", "temperature"),
50
+ max_tokens=self.config.getint("LLM", "maxTokens"))
51
+ | StrOutputParser()
52
  )
53
  return chain
54
  except Exception as e:
src/components/vectors/vectorstore.py CHANGED
@@ -8,31 +8,41 @@ from src.utils.logging import logger
8
 
9
  class VectorStore:
10
  def __init__(self):
11
- self.config = getConfig(path = "config.ini")
 
12
  self.vectorEmbeddings = HuggingFaceEmbeddings(
13
- model_name = self.config.get("EMBEDDINGS", "embeddingModel"),
14
- model_kwargs = {"device": self.config.get("EMBEDDINGS", "device")},
15
- encode_kwargs = {"normalize_embeddings": self.config.getboolean("EMBEDDINGS", "normalize_embeddings")}
16
  )
17
  self.splitter = RecursiveCharacterTextSplitter(
18
- chunk_size = self.config.getint("VECTORSTORE", "chunkSize"),
19
- chunk_overlap = self.config.getint("VECTORSTORE", "chunkOverlap"),
20
- add_start_index = self.config.getboolean("VECTORSTORE", "addStartIndex")
21
  )
22
 
23
  def setupStore(self, text: str):
 
 
 
 
 
 
 
 
 
24
  try:
25
  store = InMemoryVectorStore(self.vectorEmbeddings)
26
- textDocument = Document(page_content = text)
27
  documents = self.splitter.split_documents([textDocument])
28
- store.add_documents(documents = documents)
29
  return store.as_retriever(
30
- search_type = self.config.get("RETRIEVER", "searchType"),
31
- search_kwargs = {
32
  "k": self.config.getint("RETRIEVER", "k"),
33
- "fetch_k": self.config.getint("RETRIEVER", "fetchK")
34
  }
35
- )
36
  except Exception as e:
37
- print(CustomException(e))
38
- logger.error(CustomException(e))
 
8
 
9
class VectorStore:
    def __init__(self):
        """Initialize the VectorStore with configuration, embeddings, and text splitter."""
        self.config = getConfig(path="config.ini")
        self.vectorEmbeddings = HuggingFaceEmbeddings(
            model_name=self.config.get("EMBEDDINGS", "embeddingModel"),
            model_kwargs={"device": self.config.get("EMBEDDINGS", "device")},
            encode_kwargs={"normalize_embeddings": self.config.getboolean("EMBEDDINGS", "normalize_embeddings")}
        )
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.config.getint("VECTORSTORE", "chunkSize"),
            chunk_overlap=self.config.getint("VECTORSTORE", "chunkOverlap"),
            add_start_index=self.config.getboolean("VECTORSTORE", "addStartIndex")
        )

    def setupStore(self, text: str):
        """
        Set up an in-memory vector store populated with the provided text.

        Args:
            text (str): The text to chunk, embed, and store.

        Returns:
            A retriever for querying the vector store, or None if setup
            fails (the failure is logged).
        """
        try:
            store = InMemoryVectorStore(self.vectorEmbeddings)
            textDocument = Document(page_content=text)
            documents = self.splitter.split_documents([textDocument])
            store.add_documents(documents=documents)
            return store.as_retriever(
                search_type=self.config.get("RETRIEVER", "searchType"),
                search_kwargs={
                    "k": self.config.getint("RETRIEVER", "k"),
                    "fetch_k": self.config.getint("RETRIEVER", "fetchK")
                }
            )
        except Exception as e:
            # Log once; the stray print() that duplicated this on stdout
            # has been removed.
            logger.error(CustomException(e))
src/pipelines/completePipeline.py CHANGED
@@ -8,31 +8,77 @@ load_dotenv("secrets.env")
8
 
9
  class Pipeline:
10
  def __init__(self):
 
11
  self.pdfLoader = PdfLoader()
12
  self.webCrawler = WebsiteCrawler()
13
  self.youtubeLoader = YoutubeTranscriptLoader()
14
  self.ragChain = Chain()
15
-
16
  def plainText(self, text: str):
17
- chain = self.ragChain.returnChain(text = text)
 
 
 
 
 
 
 
 
 
18
  return chain
19
 
20
  def searchablePdf(self, path: str):
21
- extractedText = self.pdfLoader.searchablePdf(pdfPath = path)
22
- chain = self.ragChain.returnChain(text = extractedText)
 
 
 
 
 
 
 
 
 
23
  return chain
24
 
25
  def scannablePdf(self, path: str):
26
- extractedText = self.pdfLoader.scannablePdf(pdfPath = path)
27
- chain = self.ragChain.returnChain(text = extractedText)
 
 
 
 
 
 
 
 
 
28
  return chain
29
 
30
  def webCrawl(self, urls: list[str]):
31
- extractedText = self.webCrawler.extractTextFromUrlList(urls = urls)
32
- chain = self.ragChain.returnChain(text = extractedText)
 
 
 
 
 
 
 
 
 
33
  return chain
34
 
35
  def youtubeLinks(self, urls: list[str]):
36
- extractedText = self.youtubeLoader.getTranscripts(urls = urls)
37
- chain = self.ragChain.returnChain(text = extractedText)
38
- return chain
 
 
 
 
 
 
 
 
 
 
8
 
9
class Pipeline:
    def __init__(self):
        """Initialize the Pipeline with loaders and the RAG chain."""
        self.pdfLoader = PdfLoader()
        self.webCrawler = WebsiteCrawler()
        self.youtubeLoader = YoutubeTranscriptLoader()
        self.ragChain = Chain()

    def plainText(self, text: str):
        """Build and return a RAG chain directly over the given text."""
        return self.ragChain.returnChain(text=text)

    def searchablePdf(self, path: str):
        """Build and return a RAG chain from a searchable PDF at *path*."""
        extracted = self.pdfLoader.searchablePdf(pdfPath=path)
        return self.ragChain.returnChain(text=extracted)

    def scannablePdf(self, path: str):
        """Build and return a RAG chain from a scannable PDF at *path*."""
        extracted = self.pdfLoader.scannablePdf(pdfPath=path)
        return self.ragChain.returnChain(text=extracted)

    def webCrawl(self, urls: list[str]):
        """Build and return a RAG chain from text crawled out of *urls*."""
        extracted = self.webCrawler.extractTextFromUrlList(urls=urls)
        return self.ragChain.returnChain(text=extracted)

    def youtubeLinks(self, urls: list[str]):
        """Build and return a RAG chain from the transcripts of *urls*."""
        extracted = self.youtubeLoader.getTranscripts(urls=urls)
        return self.ragChain.returnChain(text=extracted)
src/utils/exceptions.py CHANGED
@@ -1,6 +1,15 @@
1
  import sys
2
 
3
  def error_message_detail(error):
 
 
 
 
 
 
 
 
 
4
  _, _, exc_info = sys.exc_info()
5
  filename = exc_info.tb_frame.f_code.co_filename
6
  lineno = exc_info.tb_lineno
@@ -9,8 +18,15 @@ def error_message_detail(error):
9
 
10
  class CustomException(Exception):
11
  def __init__(self, error_message):
 
 
 
 
 
 
12
  super().__init__(error_message)
13
  self.error_message = error_message_detail(error_message)
14
 
15
  def __str__(self) -> str:
 
16
  return self.error_message
 
1
  import sys
2
 
3
  def error_message_detail(error):
4
+ """
5
+ Generate a detailed error message.
6
+
7
+ Args:
8
+ error: The error object.
9
+
10
+ Returns:
11
+ str: A formatted error message including line number and filename.
12
+ """
13
  _, _, exc_info = sys.exc_info()
14
  filename = exc_info.tb_frame.f_code.co_filename
15
  lineno = exc_info.tb_lineno
 
18
 
19
class CustomException(Exception):
    """Exception wrapper that enriches messages with file/line context."""

    def __init__(self, error_message):
        """
        Initialize a CustomException with a detailed error message.

        Args:
            error_message (str): The original error or message to wrap.
        """
        super().__init__(error_message)
        # Resolve filename and line number from the active exception context.
        self.error_message = error_message_detail(error_message)

    def __str__(self) -> str:
        """Return the detailed error message."""
        return self.error_message
src/utils/functions.py CHANGED
@@ -3,15 +3,42 @@ import string
3
  import yaml
4
 
5
  def getConfig(path: str):
 
 
 
 
 
 
 
 
 
6
  config = configparser.ConfigParser()
7
  config.read(path)
8
  return config
9
 
10
  def cleanText(text: str):
 
 
 
 
 
 
 
 
 
11
  text = text.replace("\n", " ")
12
  text = text.translate(str.maketrans('', '', string.punctuation.replace(".", "")))
13
  return text
14
 
15
  def loadYaml(path: str):
 
 
 
 
 
 
 
 
 
16
  with open(path) as file:
17
  return yaml.safe_load(file)
 
3
  import yaml
4
 
5
def getConfig(path: str):
    """
    Load configuration from an INI file.

    Args:
        path (str): Path to the configuration file.

    Returns:
        configparser.ConfigParser: The loaded configuration object.
    """
    parser = configparser.ConfigParser()
    parser.read(path)
    return parser
18
 
19
# Translation table built once at import time: deletes every punctuation
# character except the period (rebuilding it per call was wasted work,
# since cleanText runs once per crawled page / transcript).
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation.replace(".", ""))


def cleanText(text: str) -> str:
    """
    Clean text by flattening newlines and stripping punctuation.

    Newlines become single spaces; all punctuation except periods is
    removed (periods are kept to preserve sentence boundaries).

    Args:
        text (str): The text to be cleaned.

    Returns:
        str: The cleaned text.
    """
    return text.replace("\n", " ").translate(_PUNCTUATION_TABLE)
32
 
33
def loadYaml(path: str):
    """
    Load and parse a YAML file.

    Args:
        path (str): Path to the YAML file.

    Returns:
        The parsed YAML content (typically a dict).
    """
    with open(path) as stream:
        return yaml.safe_load(stream)
src/utils/logging.py CHANGED
@@ -1,12 +1,16 @@
1
  import logging
2
 
 
3
  logger = logging.getLogger(__name__)
4
  logger.setLevel(logging.INFO)
5
 
 
6
  logFormat = "[%(asctime)s: %(levelname)s: %(module)s: %(message)s]"
7
- logFormatter = logging.Formatter(fmt = logFormat, style = "%")
8
 
 
9
  streamHandler = logging.StreamHandler()
10
  streamHandler.setFormatter(logFormatter)
11
 
 
12
  logger.addHandler(streamHandler)
 
1
  import logging
2
 
3
# Module-level logger shared by the whole package.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Single-line log format: timestamp, level, module, message.
logFormat = "[%(asctime)s: %(levelname)s: %(module)s: %(message)s]"
logFormatter = logging.Formatter(fmt=logFormat, style="%")

# Console handler; the guard prevents duplicate handlers (and therefore
# duplicated log lines) if this module is imported again after a reload.
streamHandler = logging.StreamHandler()
streamHandler.setFormatter(logFormatter)
if not logger.handlers:
    logger.addHandler(streamHandler)