Update utils.py
Browse files
utils.py
CHANGED
|
@@ -60,7 +60,7 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
| 60 |
from chromadb.errors import InvalidDimensionException
|
| 61 |
import fitz # PyMuPDF
|
| 62 |
import docx
|
| 63 |
-
from huggingface_hub import hf_hub_download
|
| 64 |
#import io
|
| 65 |
#from PIL import Image, ImageDraw, ImageOps, ImageFont
|
| 66 |
#import base64
|
|
@@ -314,12 +314,23 @@ def create_directory_loader(file_type, directory_path):
|
|
| 314 |
|
| 315 |
def load(self):
|
| 316 |
documents = []
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 323 |
return documents
|
| 324 |
|
| 325 |
return CustomLoader(directory_path, file_type, loaders[file_type])
|
|
@@ -401,6 +412,7 @@ def document_loading_splitting():
|
|
| 401 |
#os.makedirs(download_dir, exist_ok=True)
|
| 402 |
|
| 403 |
# Dateien im Hugging Face Space auflisten
|
|
|
|
| 404 |
files_in_repo = list_files_in_hf_repo(STORAGE_REPO_ID, "chroma/kkg/pdf/")
|
| 405 |
print("hier.....................................")
|
| 406 |
# Dateien aus dem Hugging Face Space mit der STORAGE_REPO_ID herunterladen
|
|
@@ -413,6 +425,15 @@ def document_loading_splitting():
|
|
| 413 |
download_file_from_hf(file_name, local_file_path)
|
| 414 |
print("file_name..................."+str(file_name))
|
| 415 |
print("local_file_path..................."+str(local_file_path))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 416 |
# Erstellen von DirectoryLoader für jeden Dateityp
|
| 417 |
pdf_loader = create_directory_loader('.pdf', CHROMA_PDF)
|
| 418 |
word_loader = create_directory_loader('.word', CHROMA_WORD)
|
|
|
|
| 60 |
from chromadb.errors import InvalidDimensionException
|
| 61 |
import fitz # PyMuPDF
|
| 62 |
import docx
|
| 63 |
+
from huggingface_hub import hf_hub_download, list_repo_files
|
| 64 |
#import io
|
| 65 |
#from PIL import Image, ImageDraw, ImageOps, ImageFont
|
| 66 |
#import base64
|
|
|
|
| 314 |
|
| 315 |
def load(self):
    """Download every file in ``self.file_list`` and load it into documents.

    Each entry of ``self.file_list`` is fetched from the Hugging Face Space
    ``STORAGE_REPO_ID`` into a throw-away directory, passed to
    ``self.loader_func``, and removed again once it has been loaded.

    Returns:
        list: The concatenated results of ``self.loader_func`` for all
        files, in the order of ``self.file_list``.
    """
    documents = []
    for file_path in self.file_list:
        # A dedicated temporary directory per file guarantees that the
        # download (including any sub-folders taken from ``file_path``)
        # is cleaned up, even if the loader raises.
        with tempfile.TemporaryDirectory() as temp_dir:
            # hf_hub_download stores the file at ``temp_dir/<file_path>``
            # and returns that location; loading must use the returned
            # path, not a separately created placeholder file.
            local_path = hf_hub_download(
                repo_id=STORAGE_REPO_ID,
                filename=file_path,
                repo_type="space",
                local_dir=temp_dir,
                local_dir_use_symlinks=False,
            )
            documents.extend(self.loader_func(local_path))
    return documents
|
| 335 |
|
| 336 |
return CustomLoader(directory_path, file_type, loaders[file_type])
|
|
|
|
| 412 |
#os.makedirs(download_dir, exist_ok=True)
|
| 413 |
|
| 414 |
# Dateien im Hugging Face Space auflisten
|
| 415 |
+
"""
|
| 416 |
files_in_repo = list_files_in_hf_repo(STORAGE_REPO_ID, "chroma/kkg/pdf/")
|
| 417 |
print("hier.....................................")
|
| 418 |
# Dateien aus dem Hugging Face Space mit der STORAGE_REPO_ID herunterladen
|
|
|
|
| 425 |
download_file_from_hf(file_name, local_file_path)
|
| 426 |
print("file_name..................."+str(file_name))
|
| 427 |
print("local_file_path..................."+str(local_file_path))
|
| 428 |
+
"""
|
| 429 |
+
|
| 430 |
+
|
| 431 |
+
# Dateien im Hugging Face Space auflisten
|
| 432 |
+
files_in_repo = list_repo_files(repo_id=STORAGE_REPO_ID, repo_type="space")
|
| 433 |
+
pdf_files = [f for f in files_in_repo if f.endswith('.pdf') and f.startswith("chroma/kkg/pdf/")]
|
| 434 |
+
word_files = [f for f in files_in_repo if f.endswith('.docx') and f.startswith("chroma/kkg/word/")]
|
| 435 |
+
|
| 436 |
+
|
| 437 |
# Erstellen von DirectoryLoader für jeden Dateityp
|
| 438 |
pdf_loader = create_directory_loader('.pdf', CHROMA_PDF)
|
| 439 |
word_loader = create_directory_loader('.word', CHROMA_WORD)
|