Spaces:
Sleeping
Sleeping
Commit ·
1802405
1
Parent(s): d14c53b
UPDATE: code update
Browse files- app.py +163 -228
- requirements.txt +0 -134
- setup.py +8 -6
- src/components/loaders/pdfLoader.py +51 -13
- src/components/loaders/websiteCrawler.py +55 -20
- src/components/loaders/youtubeLoader.py +15 -8
- src/components/rag/RAG.py +33 -20
- src/components/vectors/vectorstore.py +25 -15
- src/pipelines/completePipeline.py +57 -11
- src/utils/exceptions.py +16 -0
- src/utils/functions.py +27 -0
- src/utils/logging.py +5 -1
app.py
CHANGED
|
@@ -1,314 +1,249 @@
|
|
|
|
|
| 1 |
from src.pipelines.completePipeline import Pipeline
|
| 2 |
import gradio as gr
|
| 3 |
import spaces
|
| 4 |
-
import os
|
| 5 |
|
| 6 |
-
#
|
| 7 |
-
#
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
chain = None
|
| 11 |
-
pipeline = Pipeline()
|
| 12 |
|
| 13 |
@spaces.GPU
|
| 14 |
-
def getTextResponse(text: str, inputQuery: str):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
global chain
|
| 16 |
if chain is None:
|
| 17 |
-
chain = pipeline.plainText(text
|
| 18 |
-
|
| 19 |
-
pass
|
| 20 |
-
response = chain.invoke(
|
| 21 |
-
{
|
| 22 |
-
"question": inputQuery
|
| 23 |
-
}
|
| 24 |
-
)
|
| 25 |
return response
|
| 26 |
|
| 27 |
-
|
| 28 |
@spaces.GPU
|
| 29 |
-
def getSearchablePdfResponse(path: str, inputQuery: str):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
global chain
|
| 31 |
if chain is None:
|
| 32 |
-
chain = pipeline.searchablePdf(path
|
| 33 |
-
|
| 34 |
-
pass
|
| 35 |
-
response = chain.invoke(
|
| 36 |
-
{
|
| 37 |
-
"question": inputQuery
|
| 38 |
-
}
|
| 39 |
-
)
|
| 40 |
return response
|
| 41 |
|
| 42 |
@spaces.GPU
|
| 43 |
-
def getScannablePdfResponse(path: str, inputQuery: str):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
global chain
|
| 45 |
if chain is None:
|
| 46 |
-
chain = pipeline.scannablePdf(path
|
| 47 |
-
|
| 48 |
-
pass
|
| 49 |
-
response = chain.invoke(
|
| 50 |
-
{
|
| 51 |
-
"question": inputQuery
|
| 52 |
-
}
|
| 53 |
-
)
|
| 54 |
return response
|
| 55 |
|
| 56 |
-
def clearFunction():
|
|
|
|
| 57 |
global chain
|
| 58 |
chain = None
|
| 59 |
|
|
|
|
| 60 |
with gr.Blocks() as textInterface:
|
| 61 |
with gr.Row():
|
| 62 |
inputText = gr.Textbox(
|
| 63 |
-
label
|
| 64 |
-
placeholder
|
| 65 |
)
|
| 66 |
with gr.Row():
|
| 67 |
question = gr.Textbox(
|
| 68 |
-
label
|
| 69 |
-
placeholder
|
| 70 |
)
|
| 71 |
answer = gr.Textbox(
|
| 72 |
-
label
|
| 73 |
-
interactive
|
| 74 |
)
|
| 75 |
with gr.Row():
|
| 76 |
-
submitButton = gr.Button(
|
| 77 |
-
value = "Submit",
|
| 78 |
-
variant = "primary"
|
| 79 |
-
)
|
| 80 |
clearButton = gr.ClearButton(
|
| 81 |
-
components
|
| 82 |
-
value
|
| 83 |
-
variant
|
| 84 |
)
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
outputs = [answer]
|
| 89 |
-
)
|
| 90 |
-
clearButton.click(
|
| 91 |
-
fn = clearFunction
|
| 92 |
-
)
|
| 93 |
-
|
| 94 |
|
|
|
|
| 95 |
with gr.Blocks() as searchablePdf:
|
| 96 |
with gr.Row():
|
| 97 |
inputFile = gr.File(
|
| 98 |
-
file_types
|
| 99 |
-
file_count
|
| 100 |
-
label
|
| 101 |
)
|
| 102 |
with gr.Row():
|
| 103 |
-
question = gr.Textbox(
|
| 104 |
-
|
| 105 |
-
placeholder = "Enter your question here"
|
| 106 |
-
)
|
| 107 |
-
answer = gr.Textbox(
|
| 108 |
-
label = "Response",
|
| 109 |
-
interactive = False
|
| 110 |
-
)
|
| 111 |
with gr.Row():
|
| 112 |
-
submitButton = gr.Button(
|
| 113 |
-
value = "Submit",
|
| 114 |
-
variant = "primary"
|
| 115 |
-
)
|
| 116 |
clearButton = gr.ClearButton(
|
| 117 |
-
components
|
| 118 |
-
value
|
| 119 |
-
variant
|
| 120 |
)
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
outputs = [answer]
|
| 125 |
-
)
|
| 126 |
-
clearButton.click(
|
| 127 |
-
fn = clearFunction
|
| 128 |
-
)
|
| 129 |
-
|
| 130 |
|
|
|
|
| 131 |
with gr.Blocks() as scannablePdf:
|
| 132 |
with gr.Row():
|
| 133 |
-
inputFile = gr.File(
|
| 134 |
-
file_types = [".pdf"],
|
| 135 |
-
file_count = "single",
|
| 136 |
-
label = "Select PDF"
|
| 137 |
-
)
|
| 138 |
with gr.Row():
|
| 139 |
-
question = gr.Textbox(
|
| 140 |
-
|
| 141 |
-
placeholder = "Enter your question here"
|
| 142 |
-
)
|
| 143 |
-
answer = gr.Textbox(
|
| 144 |
-
label = "Response",
|
| 145 |
-
interactive = False
|
| 146 |
-
)
|
| 147 |
with gr.Row():
|
| 148 |
-
submitButton = gr.Button(
|
| 149 |
-
value = "Submit",
|
| 150 |
-
variant = "primary"
|
| 151 |
-
)
|
| 152 |
clearButton = gr.ClearButton(
|
| 153 |
-
components
|
| 154 |
-
value
|
| 155 |
-
variant
|
| 156 |
-
)
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 165 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
choices = links,
|
| 171 |
-
label = "Fetched Links",
|
| 172 |
-
visible = True
|
| 173 |
-
)
|
| 174 |
-
row2 = gr.Row(visible = True)
|
| 175 |
-
row3 = gr.Row(visible = True)
|
| 176 |
-
return (
|
| 177 |
-
checkboxes,
|
| 178 |
-
row2,
|
| 179 |
-
row3
|
| 180 |
-
)
|
| 181 |
|
| 182 |
-
|
| 183 |
-
|
|
|
|
| 184 |
global chain
|
| 185 |
if chain is None:
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
else:
|
| 189 |
-
pass
|
| 190 |
-
response = chain.invoke(
|
| 191 |
-
{
|
| 192 |
-
"question": inputQuery
|
| 193 |
-
}
|
| 194 |
-
)
|
| 195 |
return response
|
| 196 |
|
| 197 |
-
def clearWebsiteResponse():
|
|
|
|
| 198 |
global chain
|
| 199 |
-
chain = None
|
| 200 |
-
checkboxes = gr.CheckboxGroup(
|
| 201 |
-
choices = [],
|
| 202 |
-
label = "Fetched Links",
|
| 203 |
-
visible = False
|
| 204 |
-
)
|
| 205 |
return checkboxes
|
| 206 |
|
|
|
|
| 207 |
with gr.Blocks() as websiteCrawler:
|
| 208 |
with gr.Row():
|
| 209 |
inputUrl = gr.Textbox(
|
| 210 |
-
label
|
| 211 |
-
placeholder
|
| 212 |
-
scale
|
| 213 |
-
)
|
| 214 |
-
getLinksButton = gr.Button(
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
)
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
label = "Fetched Links",
|
| 222 |
-
)
|
| 223 |
-
with gr.Row(visible = False) as row2:
|
| 224 |
-
question = gr.Textbox(
|
| 225 |
-
label = "Question",
|
| 226 |
-
placeholder = "Enter your question here"
|
| 227 |
-
)
|
| 228 |
-
answer = gr.Textbox(
|
| 229 |
-
label = "Response",
|
| 230 |
-
interactive = False
|
| 231 |
-
)
|
| 232 |
-
with gr.Row(visible = False) as row3:
|
| 233 |
-
submitButton = gr.Button(
|
| 234 |
-
value = "Submit",
|
| 235 |
-
variant = "primary"
|
| 236 |
-
)
|
| 237 |
clearButton = gr.ClearButton(
|
| 238 |
-
components
|
| 239 |
-
value
|
| 240 |
-
variant
|
| 241 |
)
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
)
|
| 247 |
-
submitButton.click(
|
| 248 |
-
fn = getWebsiteResponse,
|
| 249 |
-
inputs = [checkboxes, question],
|
| 250 |
-
outputs = [answer]
|
| 251 |
-
)
|
| 252 |
-
clearButton.click(
|
| 253 |
-
fn = clearWebsiteResponse,
|
| 254 |
-
inputs = None,
|
| 255 |
-
outputs = [checkboxes]
|
| 256 |
-
)
|
| 257 |
|
| 258 |
@spaces.GPU
|
| 259 |
-
def getYoutubeResponse(links: str, inputQuery: str):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 260 |
global chain
|
| 261 |
-
links = [link.strip() for link in links.split(",")]
|
| 262 |
if chain is None:
|
| 263 |
-
chain = pipeline.youtubeLinks(urls
|
| 264 |
-
|
| 265 |
-
pass
|
| 266 |
-
response = chain.invoke(
|
| 267 |
-
{
|
| 268 |
-
"question": inputQuery
|
| 269 |
-
}
|
| 270 |
-
)
|
| 271 |
return response
|
| 272 |
|
| 273 |
-
|
| 274 |
with gr.Blocks() as youtubeInterface:
|
| 275 |
with gr.Row():
|
| 276 |
inputLinks = gr.Textbox(
|
| 277 |
-
label
|
| 278 |
-
placeholder
|
| 279 |
)
|
| 280 |
with gr.Row():
|
| 281 |
-
question = gr.Textbox(
|
| 282 |
-
|
| 283 |
-
placeholder = "Enter your question here"
|
| 284 |
-
)
|
| 285 |
-
answer = gr.Textbox(
|
| 286 |
-
label = "Response",
|
| 287 |
-
interactive = False
|
| 288 |
-
)
|
| 289 |
with gr.Row():
|
| 290 |
-
submitButton = gr.Button(
|
| 291 |
-
value = "Submit",
|
| 292 |
-
variant = "primary"
|
| 293 |
-
)
|
| 294 |
clearButton = gr.ClearButton(
|
| 295 |
-
components
|
| 296 |
-
value
|
| 297 |
-
variant
|
| 298 |
)
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
outputs = [answer]
|
| 303 |
-
)
|
| 304 |
-
clearButton.click(
|
| 305 |
-
fn = clearFunction
|
| 306 |
-
)
|
| 307 |
-
|
| 308 |
|
|
|
|
| 309 |
application = gr.TabbedInterface(
|
| 310 |
[textInterface, searchablePdf, scannablePdf, websiteCrawler, youtubeInterface],
|
| 311 |
["Text", "Searchable PDF", "Scannable PDF", "Website Text", "Youtube Transcripts"]
|
| 312 |
)
|
| 313 |
|
|
|
|
| 314 |
application.launch()
|
|
|
|
| 1 |
+
# Import necessary libraries and modules
|
| 2 |
from src.pipelines.completePipeline import Pipeline
|
| 3 |
import gradio as gr
|
| 4 |
import spaces
|
|
|
|
| 5 |
|
| 6 |
+
# Initialize global variables
|
| 7 |
+
chain = None # Holds the current processing chain
|
| 8 |
+
pipeline = Pipeline() # Instantiate the processing pipeline
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
@spaces.GPU
|
| 11 |
+
def getTextResponse(text: str, inputQuery: str) -> str:
|
| 12 |
+
"""
|
| 13 |
+
Generate a response based on the input text and query.
|
| 14 |
+
|
| 15 |
+
Args:
|
| 16 |
+
text (str): The input text to process.
|
| 17 |
+
inputQuery (str): The question to be answered.
|
| 18 |
+
|
| 19 |
+
Returns:
|
| 20 |
+
str: The response generated from the input text.
|
| 21 |
+
"""
|
| 22 |
global chain
|
| 23 |
if chain is None:
|
| 24 |
+
chain = pipeline.plainText(text=text) # Create a new processing chain for plain text
|
| 25 |
+
response = chain.invoke({"question": inputQuery}) # Process the query
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
return response
|
| 27 |
|
|
|
|
| 28 |
@spaces.GPU
|
| 29 |
+
def getSearchablePdfResponse(path: str, inputQuery: str) -> str:
|
| 30 |
+
"""
|
| 31 |
+
Generate a response based on a searchable PDF and query.
|
| 32 |
+
|
| 33 |
+
Args:
|
| 34 |
+
path (str): Path to the searchable PDF.
|
| 35 |
+
inputQuery (str): The question to be answered.
|
| 36 |
+
|
| 37 |
+
Returns:
|
| 38 |
+
str: The response generated from the searchable PDF.
|
| 39 |
+
"""
|
| 40 |
global chain
|
| 41 |
if chain is None:
|
| 42 |
+
chain = pipeline.searchablePdf(path=path) # Create a new processing chain for the PDF
|
| 43 |
+
response = chain.invoke({"question": inputQuery})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
return response
|
| 45 |
|
| 46 |
@spaces.GPU
|
| 47 |
+
def getScannablePdfResponse(path: str, inputQuery: str) -> str:
|
| 48 |
+
"""
|
| 49 |
+
Generate a response based on a scannable PDF and query.
|
| 50 |
+
|
| 51 |
+
Args:
|
| 52 |
+
path (str): Path to the scannable PDF.
|
| 53 |
+
inputQuery (str): The question to be answered.
|
| 54 |
+
|
| 55 |
+
Returns:
|
| 56 |
+
str: The response generated from the scannable PDF.
|
| 57 |
+
"""
|
| 58 |
global chain
|
| 59 |
if chain is None:
|
| 60 |
+
chain = pipeline.scannablePdf(path=path) # Create a new processing chain for the scannable PDF
|
| 61 |
+
response = chain.invoke({"question": inputQuery})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
return response
|
| 63 |
|
| 64 |
+
def clearFunction() -> None:
|
| 65 |
+
"""Reset the processing chain to prepare for new queries."""
|
| 66 |
global chain
|
| 67 |
chain = None
|
| 68 |
|
| 69 |
+
# User interface for text input
|
| 70 |
with gr.Blocks() as textInterface:
|
| 71 |
with gr.Row():
|
| 72 |
inputText = gr.Textbox(
|
| 73 |
+
label="Input Text",
|
| 74 |
+
placeholder="Enter your text here"
|
| 75 |
)
|
| 76 |
with gr.Row():
|
| 77 |
question = gr.Textbox(
|
| 78 |
+
label="Question",
|
| 79 |
+
placeholder="Enter your question here"
|
| 80 |
)
|
| 81 |
answer = gr.Textbox(
|
| 82 |
+
label="Response",
|
| 83 |
+
interactive=False # Make the response field read-only
|
| 84 |
)
|
| 85 |
with gr.Row():
|
| 86 |
+
submitButton = gr.Button(value="Submit", variant="primary")
|
|
|
|
|
|
|
|
|
|
| 87 |
clearButton = gr.ClearButton(
|
| 88 |
+
components=[inputText, question, answer],
|
| 89 |
+
value="Clear",
|
| 90 |
+
variant="secondary"
|
| 91 |
)
|
| 92 |
+
# Define actions for buttons
|
| 93 |
+
submitButton.click(fn=getTextResponse, inputs=[inputText, question], outputs=[answer])
|
| 94 |
+
clearButton.click(fn=clearFunction)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
|
| 96 |
+
# User interface for searchable PDF input
|
| 97 |
with gr.Blocks() as searchablePdf:
|
| 98 |
with gr.Row():
|
| 99 |
inputFile = gr.File(
|
| 100 |
+
file_types=[".pdf"], # Restrict file types to PDFs
|
| 101 |
+
file_count="single", # Allow only one PDF file selection
|
| 102 |
+
label="Select PDF"
|
| 103 |
)
|
| 104 |
with gr.Row():
|
| 105 |
+
question = gr.Textbox(label="Question", placeholder="Enter your question here")
|
| 106 |
+
answer = gr.Textbox(label="Response", interactive=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
with gr.Row():
|
| 108 |
+
submitButton = gr.Button(value="Submit", variant="primary")
|
|
|
|
|
|
|
|
|
|
| 109 |
clearButton = gr.ClearButton(
|
| 110 |
+
components=[inputFile, question, answer],
|
| 111 |
+
value="Clear",
|
| 112 |
+
variant="secondary"
|
| 113 |
)
|
| 114 |
+
# Define actions for buttons
|
| 115 |
+
submitButton.click(fn=getSearchablePdfResponse, inputs=[inputFile, question], outputs=[answer])
|
| 116 |
+
clearButton.click(fn=clearFunction)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
|
| 118 |
+
# User interface for scannable PDF input
|
| 119 |
with gr.Blocks() as scannablePdf:
|
| 120 |
with gr.Row():
|
| 121 |
+
inputFile = gr.File(file_types=[".pdf"], file_count="single", label="Select PDF")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 122 |
with gr.Row():
|
| 123 |
+
question = gr.Textbox(label="Question", placeholder="Enter your question here")
|
| 124 |
+
answer = gr.Textbox(label="Response", interactive=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
with gr.Row():
|
| 126 |
+
submitButton = gr.Button(value="Submit", variant="primary")
|
|
|
|
|
|
|
|
|
|
| 127 |
clearButton = gr.ClearButton(
|
| 128 |
+
components=[inputFile, question, answer],
|
| 129 |
+
value="Clear",
|
| 130 |
+
variant="secondary"
|
| 131 |
+
)
|
| 132 |
+
# Define actions for buttons
|
| 133 |
+
submitButton.click(fn=getScannablePdfResponse, inputs=[inputFile, question], outputs=[answer])
|
| 134 |
+
clearButton.click(fn=clearFunction)
|
| 135 |
+
|
| 136 |
+
def getLinksButtonFn(baseUrl: str) -> tuple:
|
| 137 |
+
"""
|
| 138 |
+
Fetch links from the specified base URL.
|
| 139 |
+
|
| 140 |
+
Args:
|
| 141 |
+
baseUrl (str): The base URL from which to fetch links.
|
| 142 |
+
|
| 143 |
+
Returns:
|
| 144 |
+
tuple: A tuple containing a CheckboxGroup of fetched links and two rows for the UI.
|
| 145 |
+
"""
|
| 146 |
+
links = pipeline.webCrawler.getLinks(url=baseUrl) # Fetch links using the web crawler
|
| 147 |
+
checkboxes = gr.CheckboxGroup(choices=links, label="Fetched Links", visible=True)
|
| 148 |
+
row2 = gr.Row(visible=True)
|
| 149 |
+
row3 = gr.Row(visible=True)
|
| 150 |
+
return checkboxes, row2, row3
|
| 151 |
|
| 152 |
+
@spaces.GPU
|
| 153 |
+
def getWebsiteResponse(links: list[str], inputQuery: str) -> str:
|
| 154 |
+
"""
|
| 155 |
+
Generate a response based on fetched website links and a query.
|
| 156 |
|
| 157 |
+
Args:
|
| 158 |
+
links (list[str]): List of links to process.
|
| 159 |
+
inputQuery (str): The question to be answered.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 160 |
|
| 161 |
+
Returns:
|
| 162 |
+
str: The response generated from the website links.
|
| 163 |
+
"""
|
| 164 |
global chain
|
| 165 |
if chain is None:
|
| 166 |
+
chain = pipeline.webCrawl(urls=links) # Create a new processing chain for web crawling
|
| 167 |
+
response = chain.invoke({"question": inputQuery})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
return response
|
| 169 |
|
| 170 |
+
def clearWebsiteResponse() -> gr.CheckboxGroup:
|
| 171 |
+
"""Clear the website response and reset the checkboxes."""
|
| 172 |
global chain
|
| 173 |
+
chain = None # Reset the chain
|
| 174 |
+
checkboxes = gr.CheckboxGroup(choices=[], label="Fetched Links", visible=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
return checkboxes
|
| 176 |
|
| 177 |
+
# User interface for website crawling
|
| 178 |
with gr.Blocks() as websiteCrawler:
|
| 179 |
with gr.Row():
|
| 180 |
inputUrl = gr.Textbox(
|
| 181 |
+
label="Base URL",
|
| 182 |
+
placeholder="Enter the Base URL to fetch other links",
|
| 183 |
+
scale=3
|
| 184 |
+
)
|
| 185 |
+
getLinksButton = gr.Button(value="Get Links", variant="primary", scale=1)
|
| 186 |
+
checkboxes = gr.CheckboxGroup(choices=[], label="Fetched Links")
|
| 187 |
+
with gr.Row(visible=False) as row2:
|
| 188 |
+
question = gr.Textbox(label="Question", placeholder="Enter your question here")
|
| 189 |
+
answer = gr.Textbox(label="Response", interactive=False)
|
| 190 |
+
with gr.Row(visible=False) as row3:
|
| 191 |
+
submitButton = gr.Button(value="Submit", variant="primary")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 192 |
clearButton = gr.ClearButton(
|
| 193 |
+
components=[question, answer],
|
| 194 |
+
value="Clear",
|
| 195 |
+
variant="secondary"
|
| 196 |
)
|
| 197 |
+
# Define actions for buttons
|
| 198 |
+
getLinksButton.click(fn=getLinksButtonFn, inputs=[inputUrl], outputs=[checkboxes, row2, row3])
|
| 199 |
+
submitButton.click(fn=getWebsiteResponse, inputs=[checkboxes, question], outputs=[answer])
|
| 200 |
+
clearButton.click(fn=clearWebsiteResponse, inputs=None, outputs=[checkboxes])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 201 |
|
| 202 |
@spaces.GPU
|
| 203 |
+
def getYoutubeResponse(links: str, inputQuery: str) -> str:
|
| 204 |
+
"""
|
| 205 |
+
Generate a response based on YouTube video links and a query.
|
| 206 |
+
|
| 207 |
+
Args:
|
| 208 |
+
links (str): Comma-separated YouTube video links.
|
| 209 |
+
inputQuery (str): The question to be answered.
|
| 210 |
+
|
| 211 |
+
Returns:
|
| 212 |
+
str: The response generated from the YouTube videos.
|
| 213 |
+
"""
|
| 214 |
global chain
|
| 215 |
+
links = [link.strip() for link in links.split(",")] # Split and clean the links
|
| 216 |
if chain is None:
|
| 217 |
+
chain = pipeline.youtubeLinks(urls=links) # Create a new processing chain for YouTube links
|
| 218 |
+
response = chain.invoke({"question": inputQuery})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 219 |
return response
|
| 220 |
|
| 221 |
+
# User interface for YouTube links
|
| 222 |
with gr.Blocks() as youtubeInterface:
|
| 223 |
with gr.Row():
|
| 224 |
inputLinks = gr.Textbox(
|
| 225 |
+
label="Youtube Links",
|
| 226 |
+
placeholder='Enter comma(,)-separated youtube video links'
|
| 227 |
)
|
| 228 |
with gr.Row():
|
| 229 |
+
question = gr.Textbox(label="Question", placeholder="Enter your question here")
|
| 230 |
+
answer = gr.Textbox(label="Response", interactive=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 231 |
with gr.Row():
|
| 232 |
+
submitButton = gr.Button(value="Submit", variant="primary")
|
|
|
|
|
|
|
|
|
|
| 233 |
clearButton = gr.ClearButton(
|
| 234 |
+
components=[inputLinks, question, answer],
|
| 235 |
+
value="Clear",
|
| 236 |
+
variant="secondary"
|
| 237 |
)
|
| 238 |
+
# Define actions for buttons
|
| 239 |
+
submitButton.click(fn=getYoutubeResponse, inputs=[inputLinks, question], outputs=[answer])
|
| 240 |
+
clearButton.click(fn=clearFunction)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 241 |
|
| 242 |
+
# Create a tabbed interface for the different functionalities
|
| 243 |
application = gr.TabbedInterface(
|
| 244 |
[textInterface, searchablePdf, scannablePdf, websiteCrawler, youtubeInterface],
|
| 245 |
["Text", "Searchable PDF", "Scannable PDF", "Website Text", "Youtube Transcripts"]
|
| 246 |
)
|
| 247 |
|
| 248 |
+
# Launch the Gradio application
|
| 249 |
application.launch()
|
requirements.txt
CHANGED
|
@@ -1,151 +1,17 @@
|
|
| 1 |
-
aiofiles==23.2.1
|
| 2 |
-
aiohappyeyeballs==2.4.0
|
| 3 |
-
aiohttp==3.10.6
|
| 4 |
-
aiosignal==1.3.1
|
| 5 |
-
annotated-types==0.7.0
|
| 6 |
-
anyio==4.6.0
|
| 7 |
-
asttokens==2.4.1
|
| 8 |
-
async-timeout==4.0.3
|
| 9 |
-
attrs==24.2.0
|
| 10 |
-
Authlib==1.3.2
|
| 11 |
beautifulsoup4==4.12.3
|
| 12 |
-
certifi==2024.8.30
|
| 13 |
-
cffi==1.17.1
|
| 14 |
-
charset-normalizer==3.3.2
|
| 15 |
-
click==8.0.4
|
| 16 |
-
cryptography==43.0.1
|
| 17 |
-
dataclasses-json==0.6.7
|
| 18 |
-
datasets==3.0.0
|
| 19 |
-
decorator==5.1.1
|
| 20 |
-
dill==0.3.8
|
| 21 |
-
distro==1.9.0
|
| 22 |
easyocr==1.7.2
|
| 23 |
-
exceptiongroup==1.2.2
|
| 24 |
-
executing==2.1.0
|
| 25 |
-
fastapi==0.115.0
|
| 26 |
-
ffmpy==0.4.0
|
| 27 |
-
filelock==3.16.1
|
| 28 |
-
frozenlist==1.4.1
|
| 29 |
-
fsspec==2024.6.1
|
| 30 |
gradio==5.0.2
|
| 31 |
-
gradio_client==1.4.0
|
| 32 |
-
greenlet==3.1.1
|
| 33 |
-
groq==0.11.0
|
| 34 |
-
h11==0.14.0
|
| 35 |
-
hf_transfer==0.1.8
|
| 36 |
-
httpcore==1.0.6
|
| 37 |
-
httpx==0.27.2
|
| 38 |
-
huggingface-hub==0.25.1
|
| 39 |
-
idna==3.10
|
| 40 |
-
imageio==2.35.1
|
| 41 |
-
ipython==8.28.0
|
| 42 |
-
itsdangerous==2.2.0
|
| 43 |
-
jedi==0.19.1
|
| 44 |
-
Jinja2==3.1.4
|
| 45 |
-
joblib==1.4.2
|
| 46 |
-
jsonpatch==1.33
|
| 47 |
-
jsonpointer==3.0.0
|
| 48 |
langchain==0.3.3
|
| 49 |
langchain-community==0.3.2
|
| 50 |
langchain-core==0.3.10
|
| 51 |
langchain-groq==0.2.0
|
| 52 |
langchain-huggingface==0.1.0
|
| 53 |
langchain-text-splitters==0.3.0
|
| 54 |
-
langsmith==0.1.134
|
| 55 |
-
lazy_loader==0.4
|
| 56 |
-
markdown-it-py==3.0.0
|
| 57 |
-
MarkupSafe==2.1.5
|
| 58 |
-
marshmallow==3.22.0
|
| 59 |
-
matplotlib-inline==0.1.7
|
| 60 |
-
mdurl==0.1.2
|
| 61 |
-
mpmath==1.3.0
|
| 62 |
-
multidict==6.1.0
|
| 63 |
-
multiprocess==0.70.16
|
| 64 |
-
mypy-extensions==1.0.0
|
| 65 |
-
networkx==3.3
|
| 66 |
-
ninja==1.11.1.1
|
| 67 |
numpy==1.26.4
|
| 68 |
-
nvidia-cublas-cu12==12.1.3.1
|
| 69 |
-
nvidia-cuda-cupti-cu12==12.1.105
|
| 70 |
-
nvidia-cuda-nvrtc-cu12==12.1.105
|
| 71 |
-
nvidia-cuda-runtime-cu12==12.1.105
|
| 72 |
-
nvidia-cudnn-cu12==9.1.0.70
|
| 73 |
-
nvidia-cufft-cu12==11.0.2.54
|
| 74 |
-
nvidia-curand-cu12==10.3.2.106
|
| 75 |
-
nvidia-cusolver-cu12==11.4.5.107
|
| 76 |
-
nvidia-cusparse-cu12==12.1.0.106
|
| 77 |
-
nvidia-nccl-cu12==2.20.5
|
| 78 |
-
nvidia-nvjitlink-cu12==12.6.68
|
| 79 |
-
nvidia-nvtx-cu12==12.1.105
|
| 80 |
-
opencv-python-headless==4.10.0.84
|
| 81 |
-
orjson==3.10.7
|
| 82 |
-
packaging==24.1
|
| 83 |
-
pandas==2.2.3
|
| 84 |
-
parso==0.8.4
|
| 85 |
pdf2image==1.17.0
|
| 86 |
-
pexpect==4.9.0
|
| 87 |
-
pillow==10.4.0
|
| 88 |
-
prompt_toolkit==3.0.48
|
| 89 |
-
protobuf==3.20.3
|
| 90 |
-
psutil==5.9.8
|
| 91 |
-
ptyprocess==0.7.0
|
| 92 |
-
pure_eval==0.2.3
|
| 93 |
-
pyarrow==17.0.0
|
| 94 |
-
pyclipper==1.3.0.post5
|
| 95 |
-
pycparser==2.22
|
| 96 |
-
pydantic==2.9.2
|
| 97 |
-
pydantic-settings==2.5.2
|
| 98 |
-
pydantic_core==2.23.4
|
| 99 |
-
pydub==0.25.1
|
| 100 |
-
Pygments==2.18.0
|
| 101 |
PyMuPDF==1.24.11
|
| 102 |
-
python-bidi==0.6.0
|
| 103 |
-
python-dateutil==2.9.0.post0
|
| 104 |
python-dotenv==1.0.1
|
| 105 |
-
python-multipart==0.0.12
|
| 106 |
-
pytz==2024.2
|
| 107 |
-
PyYAML==6.0.2
|
| 108 |
-
regex==2024.9.11
|
| 109 |
requests==2.32.3
|
| 110 |
-
requests-toolbelt==1.0.0
|
| 111 |
-
rich==13.9.2
|
| 112 |
-
ruff==0.6.9
|
| 113 |
-
safetensors==0.4.5
|
| 114 |
-
scikit-image==0.24.0
|
| 115 |
-
scikit-learn==1.5.2
|
| 116 |
-
scipy==1.14.1
|
| 117 |
-
semantic-version==2.10.0
|
| 118 |
-
sentence-transformers==3.2.0
|
| 119 |
-
shapely==2.0.6
|
| 120 |
-
shellingham==1.5.4
|
| 121 |
-
six==1.16.0
|
| 122 |
-
sniffio==1.3.1
|
| 123 |
-
soupsieve==2.6
|
| 124 |
-
spaces==0.30.3
|
| 125 |
-
SQLAlchemy==2.0.35
|
| 126 |
-
stack-data==0.6.3
|
| 127 |
-
starlette==0.38.6
|
| 128 |
-
sympy==1.13.3
|
| 129 |
-
tenacity==8.5.0
|
| 130 |
-
threadpoolctl==3.5.0
|
| 131 |
-
tifffile==2024.9.20
|
| 132 |
-
tokenizers==0.20.1
|
| 133 |
-
tomlkit==0.12.0
|
| 134 |
-
torch==2.4.1
|
| 135 |
-
torchvision==0.19.1
|
| 136 |
-
tqdm==4.66.5
|
| 137 |
-
traitlets==5.14.3
|
| 138 |
-
transformers==4.45.2
|
| 139 |
-
triton==3.0.0
|
| 140 |
-
typer==0.12.5
|
| 141 |
-
typing-inspect==0.9.0
|
| 142 |
-
typing_extensions==4.12.2
|
| 143 |
-
tzdata==2024.2
|
| 144 |
urllib3==2.2.3
|
| 145 |
-
uvicorn==0.31.1
|
| 146 |
-
wcwidth==0.2.13
|
| 147 |
-
websockets==12.0
|
| 148 |
-
xxhash==3.5.0
|
| 149 |
-
yarl==1.12.1
|
| 150 |
youtube-transcript-api==0.6.2
|
| 151 |
-e .
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
beautifulsoup4==4.12.3
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
easyocr==1.7.2
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
gradio==5.0.2
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
langchain==0.3.3
|
| 5 |
langchain-community==0.3.2
|
| 6 |
langchain-core==0.3.10
|
| 7 |
langchain-groq==0.2.0
|
| 8 |
langchain-huggingface==0.1.0
|
| 9 |
langchain-text-splitters==0.3.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
numpy==1.26.4
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
pdf2image==1.17.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
PyMuPDF==1.24.11
|
|
|
|
|
|
|
| 13 |
python-dotenv==1.0.1
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
requests==2.32.3
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
urllib3==2.2.3
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
youtube-transcript-api==0.6.2
|
| 17 |
-e .
|
setup.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
from setuptools import setup, find_packages
|
| 2 |
|
| 3 |
HYPEN_E_DOT = "-e ."
|
|
|
|
| 4 |
def getRequirements(requirementsPath: str) -> list[str]:
|
| 5 |
with open(requirementsPath) as file:
|
| 6 |
requirements = file.read().split("\n")
|
|
@@ -8,10 +9,11 @@ def getRequirements(requirementsPath: str) -> list[str]:
|
|
| 8 |
return requirements
|
| 9 |
|
| 10 |
setup(
|
| 11 |
-
name
|
| 12 |
-
author
|
| 13 |
-
author_email
|
| 14 |
-
version
|
| 15 |
-
packages
|
| 16 |
-
install_requires
|
|
|
|
| 17 |
)
|
|
|
|
| 1 |
from setuptools import setup, find_packages
|
| 2 |
|
| 3 |
HYPEN_E_DOT = "-e ."
|
| 4 |
+
|
| 5 |
def getRequirements(requirementsPath: str) -> list[str]:
|
| 6 |
with open(requirementsPath) as file:
|
| 7 |
requirements = file.read().split("\n")
|
|
|
|
| 9 |
return requirements
|
| 10 |
|
| 11 |
setup(
|
| 12 |
+
name="ConversAI",
|
| 13 |
+
author="Rauhan Ahmed Siddiqui",
|
| 14 |
+
author_email="rauhaan.siddiqui@gmail.com",
|
| 15 |
+
version="0.1",
|
| 16 |
+
packages=find_packages(),
|
| 17 |
+
install_requires=getRequirements(requirementsPath="requirements.txt"),
|
| 18 |
+
description="ConversAI: An innovative conversational AI framework for intelligent text extraction and querying.",
|
| 19 |
)
|
src/components/loaders/pdfLoader.py
CHANGED
|
@@ -7,17 +7,37 @@ import numpy as np
|
|
| 7 |
import pymupdf
|
| 8 |
import easyocr
|
| 9 |
|
| 10 |
-
|
| 11 |
class PdfLoader:
|
| 12 |
def __init__(self) -> None:
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
def
|
| 20 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
logger.info("Text Extraction Started from Searchable PDF")
|
| 22 |
doc = pymupdf.open(pdfPath)
|
| 23 |
pages = [doc.load_page(i) for i in range(len(doc))]
|
|
@@ -27,12 +47,30 @@ class PdfLoader:
|
|
| 27 |
return "\n".join(texts)
|
| 28 |
except Exception as e:
|
| 29 |
logger.error(CustomException(e))
|
| 30 |
-
|
| 31 |
-
def getText(self, image):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
text = "\n".join([text[1] for text in self.reader.readtext(np.array(image), paragraph=True)])
|
| 33 |
-
return cleanText(text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
-
|
|
|
|
|
|
|
| 36 |
try:
|
| 37 |
logger.info("Text Extraction Started from Scannable PDF")
|
| 38 |
allImages = convert_from_path(pdfPath)
|
|
|
|
| 7 |
import pymupdf
|
| 8 |
import easyocr
|
| 9 |
|
|
|
|
| 10 |
class PdfLoader:
|
| 11 |
def __init__(self) -> None:
|
| 12 |
+
"""
|
| 13 |
+
Initialize the PdfLoader with configuration settings and an EasyOCR reader.
|
| 14 |
+
"""
|
| 15 |
+
self.config = getConfig(path="config.ini")
|
| 16 |
+
self.reader = easyocr.Reader(['en'], gpu=self.config.getboolean("EASYOCR", "gpu"))
|
| 17 |
+
|
| 18 |
+
def extractTextFromPage(self, page) -> str:
|
| 19 |
+
"""
|
| 20 |
+
Extract and clean text from a PDF page.
|
| 21 |
+
|
| 22 |
+
Args:
|
| 23 |
+
page: A PyMuPDF page object.
|
| 24 |
+
|
| 25 |
+
Returns:
|
| 26 |
+
str: Cleaned text extracted from the page.
|
| 27 |
+
"""
|
| 28 |
+
return cleanText(text=page.get_text())
|
| 29 |
+
|
| 30 |
+
def searchablePdf(self, pdfPath: str) -> str:
|
| 31 |
+
"""
|
| 32 |
+
Extract text from a searchable PDF.
|
| 33 |
+
|
| 34 |
+
Args:
|
| 35 |
+
pdfPath (str): The file path to the searchable PDF.
|
| 36 |
+
|
| 37 |
+
Returns:
|
| 38 |
+
str: All extracted text from the PDF.
|
| 39 |
+
"""
|
| 40 |
+
try:
|
| 41 |
logger.info("Text Extraction Started from Searchable PDF")
|
| 42 |
doc = pymupdf.open(pdfPath)
|
| 43 |
pages = [doc.load_page(i) for i in range(len(doc))]
|
|
|
|
| 47 |
return "\n".join(texts)
|
| 48 |
except Exception as e:
|
| 49 |
logger.error(CustomException(e))
|
| 50 |
+
|
| 51 |
+
def getText(self, image) -> str:
|
| 52 |
+
"""
|
| 53 |
+
Extract and clean text from an image using EasyOCR.
|
| 54 |
+
|
| 55 |
+
Args:
|
| 56 |
+
image: An image (numpy array).
|
| 57 |
+
|
| 58 |
+
Returns:
|
| 59 |
+
str: Cleaned text extracted from the image.
|
| 60 |
+
"""
|
| 61 |
text = "\n".join([text[1] for text in self.reader.readtext(np.array(image), paragraph=True)])
|
| 62 |
+
return cleanText(text=text)
|
| 63 |
+
|
| 64 |
+
def scannablePdf(self, pdfPath: str) -> str:
|
| 65 |
+
"""
|
| 66 |
+
Extract text from a scannable PDF using OCR.
|
| 67 |
+
|
| 68 |
+
Args:
|
| 69 |
+
pdfPath (str): The file path to the scannable PDF.
|
| 70 |
|
| 71 |
+
Returns:
|
| 72 |
+
str: All extracted text from the PDF.
|
| 73 |
+
"""
|
| 74 |
try:
|
| 75 |
logger.info("Text Extraction Started from Scannable PDF")
|
| 76 |
allImages = convert_from_path(pdfPath)
|
src/components/loaders/websiteCrawler.py
CHANGED
|
@@ -1,65 +1,100 @@
|
|
| 1 |
from concurrent.futures import ThreadPoolExecutor
|
| 2 |
from src.utils.exceptions import CustomException
|
| 3 |
from urllib.parse import urlparse, urljoin
|
| 4 |
-
from src.utils.functions import getConfig
|
| 5 |
-
from src.utils.functions import cleanText
|
| 6 |
from src.utils.logging import logger
|
| 7 |
from bs4 import BeautifulSoup
|
| 8 |
import time
|
| 9 |
import requests
|
| 10 |
|
| 11 |
-
|
| 12 |
class WebsiteCrawler:
|
| 13 |
def __init__(self):
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
response = requests.get(url)
|
| 18 |
soup = BeautifulSoup(response.content, "html.parser")
|
| 19 |
anchors = soup.find_all("a")
|
| 20 |
links = []
|
|
|
|
| 21 |
for anchor in anchors:
|
| 22 |
if "href" in anchor.attrs:
|
| 23 |
if urlparse(anchor.attrs["href"]).netloc == urlparse(url).netloc:
|
| 24 |
links.append(anchor.attrs["href"])
|
| 25 |
elif not anchor.attrs["href"].startswith(("//", "file", "javascript", "tel", "mailto", "http")):
|
| 26 |
links.append(urljoin(url + "/", anchor.attrs["href"]))
|
| 27 |
-
|
| 28 |
-
pass
|
| 29 |
links = [link for link in links if "#" not in link]
|
| 30 |
links = list(set(links))
|
| 31 |
-
|
| 32 |
-
continue
|
| 33 |
return links
|
| 34 |
-
|
| 35 |
-
def getLinks(self, url: str):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
try:
|
| 37 |
-
logger.info("
|
| 38 |
start = time.time()
|
| 39 |
links = self.getLinksFromPage(url)
|
| 40 |
uniqueLinks = set()
|
|
|
|
| 41 |
for link in links:
|
| 42 |
now = time.time()
|
| 43 |
if now - start > self.config.getint("WEBCRAWLER", "timeout"):
|
| 44 |
break
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
return list(set([x[:
|
| 48 |
except Exception as e:
|
| 49 |
logger.error(CustomException(e))
|
| 50 |
|
| 51 |
-
def extractTextFromUrl(self, url: str):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
response = requests.get(url)
|
| 53 |
response.raise_for_status()
|
| 54 |
html = response.text
|
| 55 |
soup = BeautifulSoup(html, 'html.parser')
|
| 56 |
-
return cleanText(text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
|
| 58 |
-
|
|
|
|
|
|
|
| 59 |
try:
|
| 60 |
-
logger.info("
|
| 61 |
with ThreadPoolExecutor() as executor:
|
| 62 |
texts = list(executor.map(self.extractTextFromUrl, urls))
|
| 63 |
-
return "\n".join(texts)
|
| 64 |
except Exception as e:
|
| 65 |
logger.error(CustomException(e))
|
|
|
|
| 1 |
from concurrent.futures import ThreadPoolExecutor
|
| 2 |
from src.utils.exceptions import CustomException
|
| 3 |
from urllib.parse import urlparse, urljoin
|
| 4 |
+
from src.utils.functions import getConfig, cleanText
|
|
|
|
| 5 |
from src.utils.logging import logger
|
| 6 |
from bs4 import BeautifulSoup
|
| 7 |
import time
|
| 8 |
import requests
|
| 9 |
|
|
|
|
| 10 |
class WebsiteCrawler:
    """Collect same-site links from web pages and extract their text."""

    # Per-request network timeout in seconds. Without one, requests.get can
    # block indefinitely on an unresponsive host, which would also defeat the
    # crawl-wide time budget checked in getLinks.
    requestTimeout = 30

    def __init__(self):
        """Initialize the WebsiteCrawler with configuration settings."""
        self.config = getConfig(path="config.ini")

    def getLinksFromPage(self, url: str) -> list[str]:
        """
        Extract all valid links from a given webpage.

        A link is kept when it points at the same host as ``url`` or when it
        is a relative reference (no scheme and no host). Links carrying any
        other scheme (``mailto:``, ``tel:``, ``javascript:``, ...) or another
        host are dropped, as are links containing a ``#`` fragment.

        Args:
            url (str): The URL of the webpage to extract links from.

        Returns:
            list[str]: De-duplicated links found on the page (unordered).
        """
        response = requests.get(url, timeout=self.requestTimeout)
        soup = BeautifulSoup(response.content, "html.parser")
        baseNetloc = urlparse(url).netloc
        links = set()

        for anchor in soup.find_all("a"):
            href = anchor.attrs.get("href")
            if href is None:
                continue
            parsed = urlparse(href)
            if parsed.netloc == baseNetloc:
                # Same host. Protocol-relative links ("//host/path") have no
                # scheme and cannot be fetched as-is, so make them absolute.
                links.add(href if parsed.scheme else urljoin(url, href))
            elif not parsed.scheme and not parsed.netloc:
                # Relative reference. Decide on the parsed scheme rather than
                # a string prefix, so paths such as "files/a.html" or
                # "television/" are not mistaken for file:/tel: links.
                links.add(urljoin(url + "/", href))

        return [link for link in links if "#" not in link]

    def getLinks(self, url: str) -> list[str]:
        """
        Fetch and return all unique links found from the given URL.

        The links on ``url`` are fetched first, then each of those pages is
        visited and the links found there are collected. Visiting stops once
        the ``WEBCRAWLER.timeout`` budget (seconds) from ``config.ini`` is
        exceeded. A page that fails to load is logged and skipped.

        Args:
            url (str): The starting URL to fetch links from.

        Returns:
            list[str]: Unique links with one trailing slash removed. If the
            start page itself cannot be processed, the error is logged and
            ``None`` is returned.
        """
        try:
            logger.info("Fetching links from URL")
            budget = self.config.getint("WEBCRAWLER", "timeout")
            start = time.monotonic()
            links = self.getLinksFromPage(url)
            uniqueLinks = set()

            for link in links:
                if time.monotonic() - start > budget:
                    break
                try:
                    uniqueLinks.update(self.getLinksFromPage(link))
                except Exception as e:
                    # One unreachable page should not discard every link
                    # gathered so far.
                    logger.error(CustomException(e))

            return list({x.removesuffix("/") for x in uniqueLinks})
        except Exception as e:
            logger.error(CustomException(e))

    def extractTextFromUrl(self, url: str) -> str:
        """
        Extract and clean text content from a given URL.

        Args:
            url (str): The URL of the webpage to extract text from.

        Returns:
            str: Cleaned text extracted from the webpage.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        response = requests.get(url, timeout=self.requestTimeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        return cleanText(text=soup.get_text(separator=' ', strip=True))

    def extractTextFromUrlList(self, urls: list[str]) -> str:
        """
        Extract text from a list of URLs concurrently.

        Args:
            urls (list[str]): A list of URLs to extract text from.

        Returns:
            str: All extracted text joined by newlines, in the order of
            ``urls``. If any page fails, the error is logged and ``None`` is
            returned.
        """
        try:
            logger.info("Extracting text from URLs")
            with ThreadPoolExecutor() as executor:
                texts = list(executor.map(self.extractTextFromUrl, urls))
            return "\n".join(texts)
        except Exception as e:
            logger.error(CustomException(e))
|
src/components/loaders/youtubeLoader.py
CHANGED
|
@@ -3,22 +3,29 @@ from src.utils.exceptions import CustomException
|
|
| 3 |
from src.utils.functions import cleanText
|
| 4 |
from src.utils.logging import logger
|
| 5 |
|
| 6 |
-
|
| 7 |
class YoutubeTranscriptLoader:
|
| 8 |
def __init__(self):
|
|
|
|
| 9 |
pass
|
| 10 |
|
| 11 |
-
def getTranscripts(self, urls: str):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
texts = []
|
| 13 |
for url in set(urls):
|
| 14 |
try:
|
| 15 |
-
loader = YoutubeLoader.from_youtube_url(
|
| 16 |
-
url, add_video_info=False
|
| 17 |
-
)
|
| 18 |
doc = " ".join([x.page_content for x in loader.load()])
|
| 19 |
-
texts.append(cleanText(text
|
| 20 |
except Exception as e:
|
| 21 |
logger.error(CustomException(e))
|
| 22 |
-
|
| 23 |
-
|
| 24 |
return "\n".join(texts)
|
|
|
|
| 3 |
from src.utils.functions import cleanText
|
| 4 |
from src.utils.logging import logger
|
| 5 |
|
|
|
|
| 6 |
class YoutubeTranscriptLoader:
    """Fetch and clean transcripts for YouTube videos."""

    def __init__(self):
        """Initialize the YoutubeTranscriptLoader."""
        pass

    def getTranscripts(self, urls: "list[str] | str") -> str:
        """
        Retrieve transcripts from YouTube URLs.

        Args:
            urls: Either a list of YouTube URLs or a single comma-separated
                string of URLs. Blank entries are ignored and duplicates are
                fetched only once, keeping first-seen order.

        Returns:
            str: Cleaned transcripts joined by newlines. A URL whose
            transcript cannot be loaded is logged and contributes an empty
            line.
        """
        if not urls:
            return ""
        if isinstance(urls, str):
            # Iterating a plain string would yield single characters, so
            # split it into the individual URLs first.
            urls = urls.split(",")

        # dict.fromkeys de-duplicates while preserving order, so the combined
        # transcript is deterministic (a set is not).
        uniqueUrls = dict.fromkeys(
            url.strip() for url in urls if url and url.strip()
        )

        texts = []
        for url in uniqueUrls:
            try:
                loader = YoutubeLoader.from_youtube_url(url, add_video_info=False)
                doc = " ".join([x.page_content for x in loader.load()])
                texts.append(cleanText(text=doc))
            except Exception as e:
                logger.error(CustomException(e))
                texts.append("")  # Append an empty string on error

        return "\n".join(texts)
|
src/components/rag/RAG.py
CHANGED
|
@@ -3,39 +3,52 @@ from langchain_core.output_parsers import StrOutputParser
|
|
| 3 |
from langchain_core.prompts import ChatPromptTemplate
|
| 4 |
from langchain_core.runnables import RunnableLambda
|
| 5 |
from src.utils.exceptions import CustomException
|
| 6 |
-
from src.utils.functions import getConfig
|
| 7 |
-
from src.utils.functions import loadYaml
|
| 8 |
from src.utils.logging import logger
|
| 9 |
from langchain_groq import ChatGroq
|
| 10 |
|
| 11 |
-
|
| 12 |
class Chain:
|
| 13 |
def __init__(self):
|
| 14 |
-
|
|
|
|
| 15 |
self.store = VectorStore()
|
| 16 |
-
prompt = loadYaml(path
|
| 17 |
self.prompt = ChatPromptTemplate.from_template(prompt)
|
| 18 |
|
| 19 |
-
def formatDocs(self, docs):
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
|
|
|
|
|
|
|
|
|
| 27 |
return context
|
| 28 |
|
| 29 |
def returnChain(self, text: str):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
try:
|
| 31 |
-
logger.info("
|
| 32 |
-
store = self.store.setupStore(text
|
| 33 |
chain = (
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
|
|
|
|
|
|
| 39 |
)
|
| 40 |
return chain
|
| 41 |
except Exception as e:
|
|
|
|
| 3 |
from langchain_core.prompts import ChatPromptTemplate
|
| 4 |
from langchain_core.runnables import RunnableLambda
|
| 5 |
from src.utils.exceptions import CustomException
|
| 6 |
+
from src.utils.functions import getConfig, loadYaml
|
|
|
|
| 7 |
from src.utils.logging import logger
|
| 8 |
from langchain_groq import ChatGroq
|
| 9 |
|
|
|
|
| 10 |
class Chain:
|
| 11 |
def __init__(self):
    """Load settings, create the vector store wrapper, and build the prompt template."""
    self.config = getConfig(path="config.ini")
    self.store = VectorStore()
    # The prompt text is stored under the "prompt" key of params.yaml.
    promptText = loadYaml(path="params.yaml")["prompt"]
    self.prompt = ChatPromptTemplate.from_template(promptText)
|
| 17 |
|
| 18 |
+
def formatDocs(self, docs) -> str:
    """
    Format retrieved documents into a single context string.

    Args:
        docs: Iterable of retrieved items. Items exposing a ``page_content``
            attribute contribute that text; plain strings are used as-is.
            ``None`` is treated as no documents.

    Returns:
        str: The texts joined by blank lines, or "No Context Found" when
        there is nothing to join.
    """
    # The retriever feeding this step yields document objects, not strings,
    # so joining them directly would raise TypeError.
    chunks = [getattr(doc, "page_content", doc) for doc in (docs or [])]
    return "\n\n\n".join(chunks) or "No Context Found"
|
| 30 |
|
| 31 |
def returnChain(self, text: str):
|
| 32 |
+
"""
|
| 33 |
+
Create and return a processing chain based on the input text.
|
| 34 |
+
|
| 35 |
+
Args:
|
| 36 |
+
text (str): Input text to prepare the chain.
|
| 37 |
+
|
| 38 |
+
Returns:
|
| 39 |
+
Chain: Configured chain for processing input.
|
| 40 |
+
"""
|
| 41 |
try:
|
| 42 |
+
logger.info("Preparing chain")
|
| 43 |
+
store = self.store.setupStore(text=text)
|
| 44 |
chain = (
|
| 45 |
+
{"context": RunnableLambda(lambda x: x["question"]) | store | RunnableLambda(self.formatDocs),
|
| 46 |
+
"question": RunnableLambda(lambda x: x["question"])}
|
| 47 |
+
| self.prompt
|
| 48 |
+
| ChatGroq(model_name=self.config.get("LLM", "llmModel"),
|
| 49 |
+
temperature=self.config.getfloat("LLM", "temperature"),
|
| 50 |
+
max_tokens=self.config.getint("LLM", "maxTokens"))
|
| 51 |
+
| StrOutputParser()
|
| 52 |
)
|
| 53 |
return chain
|
| 54 |
except Exception as e:
|
src/components/vectors/vectorstore.py
CHANGED
|
@@ -8,31 +8,41 @@ from src.utils.logging import logger
|
|
| 8 |
|
| 9 |
class VectorStore:
|
| 10 |
def __init__(self):
|
| 11 |
-
|
|
|
|
| 12 |
self.vectorEmbeddings = HuggingFaceEmbeddings(
|
| 13 |
-
model_name
|
| 14 |
-
model_kwargs
|
| 15 |
-
encode_kwargs
|
| 16 |
)
|
| 17 |
self.splitter = RecursiveCharacterTextSplitter(
|
| 18 |
-
chunk_size
|
| 19 |
-
chunk_overlap
|
| 20 |
-
add_start_index
|
| 21 |
)
|
| 22 |
|
| 23 |
def setupStore(self, text: str):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
try:
|
| 25 |
store = InMemoryVectorStore(self.vectorEmbeddings)
|
| 26 |
-
textDocument = Document(page_content
|
| 27 |
documents = self.splitter.split_documents([textDocument])
|
| 28 |
-
store.add_documents(documents
|
| 29 |
return store.as_retriever(
|
| 30 |
-
search_type
|
| 31 |
-
search_kwargs
|
| 32 |
"k": self.config.getint("RETRIEVER", "k"),
|
| 33 |
-
"fetch_k": self.config.getint("RETRIEVER", "fetchK")
|
| 34 |
}
|
| 35 |
-
)
|
| 36 |
except Exception as e:
|
| 37 |
-
|
| 38 |
-
|
|
|
|
| 8 |
|
| 9 |
class VectorStore:
    """Build an in-memory vector index and retriever from raw text."""

    def __init__(self):
        """Read ``config.ini`` and construct the embedding model and text splitter."""
        self.config = getConfig(path="config.ini")
        settings = self.config

        # Embedding model options come from the EMBEDDINGS section.
        embeddingModel = settings.get("EMBEDDINGS", "embeddingModel")
        modelOptions = {"device": settings.get("EMBEDDINGS", "device")}
        encodeOptions = {
            "normalize_embeddings": settings.getboolean("EMBEDDINGS", "normalize_embeddings")
        }
        self.vectorEmbeddings = HuggingFaceEmbeddings(
            model_name=embeddingModel,
            model_kwargs=modelOptions,
            encode_kwargs=encodeOptions,
        )

        # Chunking options come from the VECTORSTORE section.
        chunkSize = settings.getint("VECTORSTORE", "chunkSize")
        chunkOverlap = settings.getint("VECTORSTORE", "chunkOverlap")
        addStartIndex = settings.getboolean("VECTORSTORE", "addStartIndex")
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunkSize,
            chunk_overlap=chunkOverlap,
            add_start_index=addStartIndex,
        )

    def setupStore(self, text: str):
        """
        Index the given text and return a retriever over it.

        The text is wrapped in a single document, split into chunks, and the
        chunks are added to a fresh in-memory store.

        Args:
            text (str): The text to store and process.

        Returns:
            A retriever configured from the RETRIEVER section of the config.
            If anything fails, the error is logged and printed, and ``None``
            is returned.
        """
        try:
            index = InMemoryVectorStore(self.vectorEmbeddings)
            chunks = self.splitter.split_documents([Document(page_content=text)])
            index.add_documents(documents=chunks)

            searchType = self.config.get("RETRIEVER", "searchType")
            searchOptions = {
                "k": self.config.getint("RETRIEVER", "k"),
                "fetch_k": self.config.getint("RETRIEVER", "fetchK"),
            }
            return index.as_retriever(search_type=searchType, search_kwargs=searchOptions)
        except Exception as e:
            # Report through the logger first, then on stdout.
            for emit in (logger.error, print):
                emit(CustomException(e))
|
src/pipelines/completePipeline.py
CHANGED
|
@@ -8,31 +8,77 @@ load_dotenv("secrets.env")
|
|
| 8 |
|
| 9 |
class Pipeline:
|
| 10 |
def __init__(self):
|
|
|
|
| 11 |
self.pdfLoader = PdfLoader()
|
| 12 |
self.webCrawler = WebsiteCrawler()
|
| 13 |
self.youtubeLoader = YoutubeTranscriptLoader()
|
| 14 |
self.ragChain = Chain()
|
| 15 |
-
|
| 16 |
def plainText(self, text: str):
|
| 17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
return chain
|
| 19 |
|
| 20 |
def searchablePdf(self, path: str):
|
| 21 |
-
|
| 22 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
return chain
|
| 24 |
|
| 25 |
def scannablePdf(self, path: str):
|
| 26 |
-
|
| 27 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
return chain
|
| 29 |
|
| 30 |
def webCrawl(self, urls: list[str]):
|
| 31 |
-
|
| 32 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
return chain
|
| 34 |
|
| 35 |
def youtubeLinks(self, urls: list[str]):
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
class Pipeline:
    """Turn each supported input source into a RAG chain."""

    def __init__(self):
        """Initialize the Pipeline with loaders and the RAG chain."""
        self.pdfLoader = PdfLoader()
        self.webCrawler = WebsiteCrawler()
        self.youtubeLoader = YoutubeTranscriptLoader()
        self.ragChain = Chain()

    def plainText(self, text: str):
        """
        Build a chain directly from plain text.

        Args:
            text (str): The input text to process.

        Returns:
            The chain produced by the RAG chain builder for this text.
        """
        return self.ragChain.returnChain(text=text)

    def searchablePdf(self, path: str):
        """
        Build a chain from the text of a searchable PDF.

        Args:
            path (str): The path to the PDF file.

        Returns:
            The chain built from the extracted text.
        """
        pdfText = self.pdfLoader.searchablePdf(pdfPath=path)
        return self.ragChain.returnChain(text=pdfText)

    def scannablePdf(self, path: str):
        """
        Build a chain from the text of a scannable PDF.

        Args:
            path (str): The path to the PDF file.

        Returns:
            The chain built from the extracted text.
        """
        pdfText = self.pdfLoader.scannablePdf(pdfPath=path)
        return self.ragChain.returnChain(text=pdfText)

    def webCrawl(self, urls: list[str]):
        """
        Build a chain from the text of the given web pages.

        Args:
            urls (list[str]): A list of URLs to crawl.

        Returns:
            The chain built from the extracted text.
        """
        pageText = self.webCrawler.extractTextFromUrlList(urls=urls)
        return self.ragChain.returnChain(text=pageText)

    def youtubeLinks(self, urls: list[str]):
        """
        Build a chain from the transcripts of the given YouTube videos.

        Args:
            urls (list[str]): A list of YouTube video URLs.

        Returns:
            The chain built from the extracted transcripts.
        """
        transcriptText = self.youtubeLoader.getTranscripts(urls=urls)
        return self.ragChain.returnChain(text=transcriptText)
|
src/utils/exceptions.py
CHANGED
|
@@ -1,6 +1,15 @@
|
|
| 1 |
import sys
|
| 2 |
|
| 3 |
def error_message_detail(error):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
_, _, exc_info = sys.exc_info()
|
| 5 |
filename = exc_info.tb_frame.f_code.co_filename
|
| 6 |
lineno = exc_info.tb_lineno
|
|
@@ -9,8 +18,15 @@ def error_message_detail(error):
|
|
| 9 |
|
| 10 |
class CustomException(Exception):
|
| 11 |
def __init__(self, error_message):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
super().__init__(error_message)
|
| 13 |
self.error_message = error_message_detail(error_message)
|
| 14 |
|
| 15 |
def __str__(self) -> str:
|
|
|
|
| 16 |
return self.error_message
|
|
|
|
| 1 |
import sys
|
| 2 |
|
| 3 |
def error_message_detail(error):
|
| 4 |
+
"""
|
| 5 |
+
Generate a detailed error message.
|
| 6 |
+
|
| 7 |
+
Args:
|
| 8 |
+
error: The error object.
|
| 9 |
+
|
| 10 |
+
Returns:
|
| 11 |
+
str: A formatted error message including line number and filename.
|
| 12 |
+
"""
|
| 13 |
_, _, exc_info = sys.exc_info()
|
| 14 |
filename = exc_info.tb_frame.f_code.co_filename
|
| 15 |
lineno = exc_info.tb_lineno
|
|
|
|
| 18 |
|
| 19 |
class CustomException(Exception):
    """Exception wrapper whose string form is a detailed error message."""

    def __init__(self, error_message):
        """
        Wrap an error and compute its detailed message.

        Args:
            error_message: The original error (or message) being wrapped. It
                is passed to ``error_message_detail`` to build the text
                returned by ``str()``.
        """
        super().__init__(error_message)
        detail = error_message_detail(error_message)
        self.error_message = detail

    def __str__(self) -> str:
        """Return the detailed error message computed at construction."""
        return self.error_message
|
src/utils/functions.py
CHANGED
|
@@ -3,15 +3,42 @@ import string
|
|
| 3 |
import yaml
|
| 4 |
|
| 5 |
def getConfig(path: str):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
config = configparser.ConfigParser()
|
| 7 |
config.read(path)
|
| 8 |
return config
|
| 9 |
|
| 10 |
def cleanText(text: str):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
text = text.replace("\n", " ")
|
| 12 |
text = text.translate(str.maketrans('', '', string.punctuation.replace(".", "")))
|
| 13 |
return text
|
| 14 |
|
| 15 |
def loadYaml(path: str):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
with open(path) as file:
|
| 17 |
return yaml.safe_load(file)
|
|
|
|
| 3 |
import yaml
|
| 4 |
|
| 5 |
def getConfig(path: str):
    """
    Load configuration from a specified file.

    Args:
        path (str): The path to the configuration file.

    Returns:
        ConfigParser: The loaded configuration object.

    Raises:
        FileNotFoundError: If the file does not exist or cannot be read.
    """
    config = configparser.ConfigParser()
    # ConfigParser.read() silently skips files it cannot open and returns the
    # list of files it did parse; an empty list means nothing was loaded.
    # Fail here with a clear message instead of a later NoSectionError.
    if not config.read(path):
        raise FileNotFoundError(f"Configuration file not found or unreadable: {path}")
    return config
|
| 18 |
|
| 19 |
def cleanText(text: str):
    """
    Flatten text to one line and strip punctuation other than periods.

    Args:
        text (str): The text to be cleaned.

    Returns:
        str: The text with each newline replaced by a space and every
        ``string.punctuation`` character except "." removed.
    """
    removable = "".join(ch for ch in string.punctuation if ch != ".")
    # One translation pass: map newline -> space, delete the punctuation.
    table = str.maketrans("\n", " ", removable)
    return text.translate(table)
|
| 32 |
|
| 33 |
def loadYaml(path: str):
    """
    Load YAML content from a specified file.

    Args:
        path (str): The path to the YAML file.

    Returns:
        The parsed content of the YAML file (``None`` for an empty file).
    """
    # Read as UTF-8 explicitly so non-ASCII content parses the same way
    # regardless of the platform's default encoding.
    with open(path, encoding="utf-8") as file:
        return yaml.safe_load(file)
|
src/utils/logging.py
CHANGED
|
@@ -1,12 +1,16 @@
|
|
| 1 |
import logging
|
| 2 |
|
|
|
|
| 3 |
logger = logging.getLogger(__name__)
|
| 4 |
logger.setLevel(logging.INFO)
|
| 5 |
|
|
|
|
| 6 |
logFormat = "[%(asctime)s: %(levelname)s: %(module)s: %(message)s]"
|
| 7 |
-
logFormatter = logging.Formatter(fmt
|
| 8 |
|
|
|
|
| 9 |
streamHandler = logging.StreamHandler()
|
| 10 |
streamHandler.setFormatter(logFormatter)
|
| 11 |
|
|
|
|
| 12 |
logger.addHandler(streamHandler)
|
|
|
|
| 1 |
import logging
|
| 2 |
|
| 3 |
+
# Console handler that renders every record with the project's log format.
logFormat = "[%(asctime)s: %(levelname)s: %(module)s: %(message)s]"
logFormatter = logging.Formatter(fmt=logFormat, style="%")
streamHandler = logging.StreamHandler()
streamHandler.setFormatter(logFormatter)

# Module logger: handles INFO and above and writes through the console handler.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
logger.addHandler(streamHandler)
|