Update utils.py
Browse files
utils.py
CHANGED
|
@@ -329,10 +329,24 @@ def document_loading_splitting():
|
|
| 329 |
# Load YouTube
|
| 330 |
#loader = GenericLoader(YoutubeAudioLoader([YOUTUBE_URL_1,YOUTUBE_URL_2], PATH_WORK + YOUTUBE_DIR), OpenAIWhisperParser())
|
| 331 |
#docs.extend(loader.load())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 332 |
################################
|
| 333 |
# Document splitting
|
| 334 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)# RecursiveCharacterTextSplitter(chunk_overlap = 150, chunk_size = 1500)
|
| 335 |
-
splits = text_splitter.split_documents(
|
| 336 |
print("Splits...........................")
|
| 337 |
for split in splits:
|
| 338 |
if 'DIVIS' in split.page_content:
|
|
|
|
| 329 |
# Load YouTube
|
| 330 |
#loader = GenericLoader(YoutubeAudioLoader([YOUTUBE_URL_1,YOUTUBE_URL_2], PATH_WORK + YOUTUBE_DIR), OpenAIWhisperParser())
|
| 331 |
#docs.extend(loader.load())
|
| 332 |
+
|
| 333 |
+
|
| 334 |
+
# Preprocess the documents - matching the preprocessing applied to the prompts...
|
| 335 |
+
preprocessed_docs = []
|
| 336 |
+
for doc in docs:
|
| 337 |
+
preprocessed_content = preprocess_text(doc.page_content)
|
| 338 |
+
preprocessed_title = preprocess_text(doc.metadata["title"])
|
| 339 |
+
preprocessed_metadata = {
|
| 340 |
+
"title": preprocessed_title,
|
| 341 |
+
"page": doc.metadata["page"],
|
| 342 |
+
"path": doc.metadata["path"]
|
| 343 |
+
}
|
| 344 |
+
preprocessed_docs.append(Document(metadata=preprocessed_metadata, page_content=preprocessed_content))
|
| 345 |
+
|
| 346 |
################################
|
| 347 |
# Document splitting
|
| 348 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)# RecursiveCharacterTextSplitter(chunk_overlap = 150, chunk_size = 1500)
|
| 349 |
+
splits = text_splitter.split_documents(preprocessed_docs)
|
| 350 |
print("Splits...........................")
|
| 351 |
for split in splits:
|
| 352 |
if 'DIVIS' in split.page_content:
|