XThomasBU
committed on
Commit
·
bbd24f7
1
Parent(s):
36cd3f9
minor updates
Browse files
code/modules/config/config.yml
CHANGED
|
@@ -37,7 +37,7 @@ llm_params:
|
|
| 37 |
temperature: 0.7 # float
|
| 38 |
repo_id: 'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF' # HuggingFace repo id
|
| 39 |
filename: 'tinyllama-1.1b-chat-v1.0.Q5_0.gguf' # Specific name of gguf file in the repo
|
| 40 |
-
pdf_reader: '
|
| 41 |
stream: False # bool
|
| 42 |
|
| 43 |
chat_logging:
|
|
|
|
| 37 |
temperature: 0.7 # float
|
| 38 |
repo_id: 'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF' # HuggingFace repo id
|
| 39 |
filename: 'tinyllama-1.1b-chat-v1.0.Q5_0.gguf' # Specific name of gguf file in the repo
|
| 40 |
+
pdf_reader: 'pymupdf' # str [llama, pymupdf, gpt]
|
| 41 |
stream: False # bool
|
| 42 |
|
| 43 |
chat_logging:
|
code/modules/dataloader/data_loader.py
CHANGED
|
@@ -63,7 +63,7 @@ class HTMLReader:
|
|
| 63 |
href = href.replace("http", "https")
|
| 64 |
|
| 65 |
absolute_url = urljoin(base_url, href)
|
| 66 |
-
link[
|
| 67 |
|
| 68 |
resp = requests.head(absolute_url)
|
| 69 |
if resp.status_code != 200:
|
|
@@ -84,6 +84,7 @@ class HTMLReader:
|
|
| 84 |
else:
|
| 85 |
return None
|
| 86 |
|
|
|
|
| 87 |
class FileReader:
|
| 88 |
def __init__(self, logger, kind):
|
| 89 |
self.logger = logger
|
|
@@ -95,8 +96,9 @@ class FileReader:
|
|
| 95 |
else:
|
| 96 |
self.pdf_reader = PDFReader()
|
| 97 |
self.web_reader = HTMLReader()
|
| 98 |
-
self.logger.info(
|
| 99 |
-
|
|
|
|
| 100 |
|
| 101 |
def extract_text_from_pdf(self, pdf_path):
|
| 102 |
text = ""
|
|
@@ -374,7 +376,9 @@ class ChunkProcessor:
|
|
| 374 |
|
| 375 |
class DataLoader:
|
| 376 |
def __init__(self, config, logger=None):
|
| 377 |
-
self.file_reader = FileReader(
|
|
|
|
|
|
|
| 378 |
self.chunk_processor = ChunkProcessor(config, logger=logger)
|
| 379 |
|
| 380 |
def get_chunks(self, uploaded_files, weblinks):
|
|
@@ -392,19 +396,22 @@ if __name__ == "__main__":
|
|
| 392 |
with open("../code/modules/config/config.yml", "r") as f:
|
| 393 |
config = yaml.safe_load(f)
|
| 394 |
|
| 395 |
-
STORAGE_DIR = os.path.join(BASE_DIR, config[
|
| 396 |
uploaded_files = [
|
| 397 |
-
os.path.join(STORAGE_DIR, file)
|
|
|
|
|
|
|
| 398 |
]
|
| 399 |
|
| 400 |
data_loader = DataLoader(config, logger=logger)
|
| 401 |
document_chunks, document_names, documents, document_metadata = (
|
| 402 |
data_loader.get_chunks(
|
| 403 |
-
[
|
|
|
|
|
|
|
| 404 |
[],
|
| 405 |
)
|
| 406 |
)
|
| 407 |
|
| 408 |
print(document_names[:5])
|
| 409 |
print(len(document_chunks))
|
| 410 |
-
|
|
|
|
| 63 |
href = href.replace("http", "https")
|
| 64 |
|
| 65 |
absolute_url = urljoin(base_url, href)
|
| 66 |
+
link["href"] = absolute_url
|
| 67 |
|
| 68 |
resp = requests.head(absolute_url)
|
| 69 |
if resp.status_code != 200:
|
|
|
|
| 84 |
else:
|
| 85 |
return None
|
| 86 |
|
| 87 |
+
|
| 88 |
class FileReader:
|
| 89 |
def __init__(self, logger, kind):
|
| 90 |
self.logger = logger
|
|
|
|
| 96 |
else:
|
| 97 |
self.pdf_reader = PDFReader()
|
| 98 |
self.web_reader = HTMLReader()
|
| 99 |
+
self.logger.info(
|
| 100 |
+
f"Initialized FileReader with {kind} PDF reader and HTML reader"
|
| 101 |
+
)
|
| 102 |
|
| 103 |
def extract_text_from_pdf(self, pdf_path):
|
| 104 |
text = ""
|
|
|
|
| 376 |
|
| 377 |
class DataLoader:
|
| 378 |
def __init__(self, config, logger=None):
|
| 379 |
+
self.file_reader = FileReader(
|
| 380 |
+
logger=logger, kind=config["llm_params"]["pdf_reader"]
|
| 381 |
+
)
|
| 382 |
self.chunk_processor = ChunkProcessor(config, logger=logger)
|
| 383 |
|
| 384 |
def get_chunks(self, uploaded_files, weblinks):
|
|
|
|
| 396 |
with open("../code/modules/config/config.yml", "r") as f:
|
| 397 |
config = yaml.safe_load(f)
|
| 398 |
|
| 399 |
+
STORAGE_DIR = os.path.join(BASE_DIR, config["vectorstore"]["data_path"])
|
| 400 |
uploaded_files = [
|
| 401 |
+
os.path.join(STORAGE_DIR, file)
|
| 402 |
+
for file in os.listdir(STORAGE_DIR)
|
| 403 |
+
if file != "urls.txt"
|
| 404 |
]
|
| 405 |
|
| 406 |
data_loader = DataLoader(config, logger=logger)
|
| 407 |
document_chunks, document_names, documents, document_metadata = (
|
| 408 |
data_loader.get_chunks(
|
| 409 |
+
[
|
| 410 |
+
"https://dl4ds.github.io/sp2024/static_files/discussion_slides/00_discussion.pdf"
|
| 411 |
+
],
|
| 412 |
[],
|
| 413 |
)
|
| 414 |
)
|
| 415 |
|
| 416 |
print(document_names[:5])
|
| 417 |
print(len(document_chunks))
|
|
|