Update utils.py
Browse files
utils.py
CHANGED
|
@@ -186,8 +186,6 @@ modell_rag = DistilBertForQuestionAnswering.from_pretrained(HF_MODELL)
|
|
| 186 |
tokenizer_rag = DistilBertTokenizer.from_pretrained(HF_MODELL)
|
| 187 |
qa_pipeline = pipeline("question-answering", model=modell_rag, tokenizer=tokenizer_rag)
|
| 188 |
|
| 189 |
-
|
| 190 |
-
|
| 191 |
HF_MODELL ="EleutherAI/gpt-neo-2.7B"
|
| 192 |
modell_rag = GPTNeoForCausalLM.from_pretrained(HF_MODELL)
|
| 193 |
tokenizer_rag = GPT2Tokenizer.from_pretrained(HF_MODELL)
|
|
@@ -280,7 +278,8 @@ def access_pdf(self, filename):
|
|
| 280 |
)
|
| 281 |
|
| 282 |
return temp_path
|
| 283 |
-
|
|
|
|
| 284 |
#Besseren Directory Loader als CustomLoader definieren, der den Inhalt des Dokuments, die Seitenzahlen, die Überschriften und die Pfade zu den Dokumenten extrahiert
|
| 285 |
def create_custom_loader(file_type, file_list): #create_directory_loader(file_type, directory_path):
|
| 286 |
loaders = {
|
|
@@ -290,7 +289,7 @@ def create_custom_loader(file_type, file_list): #create_directory_loader(file_t
|
|
| 290 |
|
| 291 |
|
| 292 |
class CustomLoader:
|
| 293 |
-
|
| 294 |
def __init__(self, directory_path, file_type, loader_func):
|
| 295 |
self.directory_path = directory_path
|
| 296 |
self.file_type = file_type
|
|
@@ -306,35 +305,14 @@ def create_custom_loader(file_type, file_list): #create_directory_loader(file_t
|
|
| 306 |
return documents
|
| 307 |
|
| 308 |
return CustomLoader(directory_path, file_type, loaders[file_type])
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
def __init__(self, file_type, file_list, loader_func):
|
| 312 |
-
self.file_type = file_type
|
| 313 |
-
self.file_list = file_list
|
| 314 |
-
self.loader_func = loader_func
|
| 315 |
-
|
| 316 |
-
def load(self):
|
| 317 |
-
documents = []
|
| 318 |
-
for file_path in self.file_list:
|
| 319 |
-
with tempfile.NamedTemporaryFile(delete=False, suffix=self.file_type) as temp_file:
|
| 320 |
-
temp_path = temp_file.name
|
| 321 |
-
|
| 322 |
-
# Datei aus dem Hugging Face Space herunterladen
|
| 323 |
-
hf_hub_download(
|
| 324 |
-
repo_id=STORAGE_REPO_ID,
|
| 325 |
-
filename=file_path,
|
| 326 |
-
repo_type="space",
|
| 327 |
-
local_dir=os.path.dirname(temp_path),
|
| 328 |
-
local_dir_use_symlinks=False,
|
| 329 |
-
token=hf_token
|
| 330 |
-
)
|
| 331 |
-
|
| 332 |
-
documents.extend(self.loader_func(temp_path))
|
| 333 |
-
|
| 334 |
-
# Temporäre Datei löschen
|
| 335 |
-
os.unlink(temp_path)
|
| 336 |
-
return documents
|
| 337 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 338 |
return CustomLoader(file_type, file_list, loaders[file_type])
|
| 339 |
|
| 340 |
################################################
|
|
@@ -906,6 +884,15 @@ def get_filename(file_pfad):
|
|
| 906 |
return result
|
| 907 |
|
| 908 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 909 |
#################################################
|
| 910 |
#Klasse mit zuständen - z.B. für interrupt wenn Stop gedrückt...
|
| 911 |
#################################################
|
|
@@ -932,14 +919,35 @@ class Document:
|
|
| 932 |
}
|
| 933 |
|
| 934 |
|
| 935 |
-
|
| 936 |
-
|
| 937 |
-
|
| 938 |
-
|
| 939 |
-
|
| 940 |
-
|
| 941 |
-
|
| 942 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 943 |
|
| 944 |
|
| 945 |
|
|
|
|
| 186 |
tokenizer_rag = DistilBertTokenizer.from_pretrained(HF_MODELL)
|
| 187 |
qa_pipeline = pipeline("question-answering", model=modell_rag, tokenizer=tokenizer_rag)
|
| 188 |
|
|
|
|
|
|
|
| 189 |
HF_MODELL ="EleutherAI/gpt-neo-2.7B"
|
| 190 |
modell_rag = GPTNeoForCausalLM.from_pretrained(HF_MODELL)
|
| 191 |
tokenizer_rag = GPT2Tokenizer.from_pretrained(HF_MODELL)
|
|
|
|
| 278 |
)
|
| 279 |
|
| 280 |
return temp_path
|
| 281 |
+
|
| 282 |
+
"""
|
| 283 |
#Besseren Directory Loader als CustomLoader definieren, der den Inhalt des Dokuments, die Seitenzahlen, die Überschriften und die Pfade zu den Dokumenten extrahiert
|
| 284 |
def create_custom_loader(file_type, file_list): #create_directory_loader(file_type, directory_path):
|
| 285 |
loaders = {
|
|
|
|
| 289 |
|
| 290 |
|
| 291 |
class CustomLoader:
|
| 292 |
+
|
| 293 |
def __init__(self, directory_path, file_type, loader_func):
|
| 294 |
self.directory_path = directory_path
|
| 295 |
self.file_type = file_type
|
|
|
|
| 305 |
return documents
|
| 306 |
|
| 307 |
return CustomLoader(directory_path, file_type, loaders[file_type])
|
| 308 |
+
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 309 |
|
| 310 |
+
|
| 311 |
+
def create_custom_loader(file_type, file_list):
    """Build a ``CustomLoader`` for the given file type.

    Args:
        file_type: File suffix selecting the loader function; the supported
            keys are ``'.pdf'`` and ``'.docx'``.
        file_list: Files the returned loader should process.

    Returns:
        A ``CustomLoader`` wired to the loader function registered for
        ``file_type``.

    Raises:
        KeyError: If ``file_type`` is not one of the supported suffixes.
    """
    # Dispatch table: file suffix -> function that parses one file.
    loader_by_suffix = dict(
        [
            ('.pdf', load_pdf_with_metadata),
            ('.docx', load_word_with_metadata),
        ]
    )
    selected_loader = loader_by_suffix[file_type]
    return CustomLoader(file_type, file_list, selected_loader)
|
| 317 |
|
| 318 |
################################################
|
|
|
|
| 884 |
return result
|
| 885 |
|
| 886 |
|
| 887 |
+
def is_stop_word_or_prefix(s: str, stop_words: list) -> bool:
    """Return True if *s* ends with a stop word or with a prefix of one.

    Args:
        s: The text to inspect.
        stop_words: Candidate stop words. Empty entries are ignored.

    Returns:
        True when the tail of ``s`` equals some stop word or one of its
        non-empty leading prefixes, otherwise False.
    """
    for stop_word in stop_words:
        if not stop_word:
            # "" is a suffix of every string; treating it as a stop word
            # would flag every input, so skip empty entries.
            continue
        # All non-empty prefixes, including the full word itself.
        candidates = tuple(
            stop_word[:end] for end in range(1, len(stop_word) + 1)
        )
        # str.endswith accepts a tuple and checks all candidates in one call.
        if s.endswith(candidates):
            return True
    return False
|
| 895 |
+
|
| 896 |
#################################################
|
| 897 |
#Klasse mit zuständen - z.B. für interrupt wenn Stop gedrückt...
|
| 898 |
#################################################
|
|
|
|
| 919 |
}
|
| 920 |
|
| 921 |
|
| 922 |
+
##########################################
|
| 923 |
+
#Class für die Directory Loader - um sie anzupassen
|
| 924 |
+
##########################################
|
| 925 |
+
class CustomLoader:
    """Load documents of one file type from the Hugging Face storage Space.

    Each entry of ``file_list`` is downloaded from the Space identified by
    ``STORAGE_REPO_ID`` and passed to ``loader_func``; the documents of all
    files are concatenated into one list.
    """

    def __init__(self, file_type, file_list, loader_func):
        """Store the loader configuration.

        Args:
            file_type: File suffix this loader is responsible for.
            file_list: Repo-relative file names to download and load.
            loader_func: Callable that takes a local file path and returns
                an iterable of documents.
        """
        self.file_type = file_type
        self.file_list = file_list
        self.loader_func = loader_func

    def load(self):
        """Download every file in ``file_list`` and return all documents.

        Returns:
            A list with the documents produced by ``loader_func`` for each
            file, in ``file_list`` order.
        """
        documents = []
        for file_path in self.file_list:
            # A throw-away directory guarantees cleanup even if the download
            # or the loader raises.
            with tempfile.TemporaryDirectory() as temp_dir:
                # Download the file from the Hugging Face Space.
                # hf_hub_download places the file under local_dir using its
                # repo-relative name and returns that path, so the returned
                # path (not a separately created temp file) must be loaded.
                local_path = hf_hub_download(
                    repo_id=STORAGE_REPO_ID,
                    filename=file_path,
                    repo_type="space",
                    local_dir=temp_dir,
                    local_dir_use_symlinks=False,
                    token=hf_token,
                )
                # Consume the loader's result while the file still exists.
                documents.extend(self.loader_func(local_path))
        return documents
|
| 949 |
+
|
| 950 |
+
|
| 951 |
|
| 952 |
|
| 953 |
|