XThomasBU
committed on
Commit
·
bbd24f7
1
Parent(s):
36cd3f9
minor updates
Browse files
code/modules/config/config.yml
CHANGED
|
@@ -37,7 +37,7 @@ llm_params:
|
|
| 37 |
temperature: 0.7 # float
|
| 38 |
repo_id: 'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF' # HuggingFace repo id
|
| 39 |
filename: 'tinyllama-1.1b-chat-v1.0.Q5_0.gguf' # Specific name of gguf file in the repo
|
| 40 |
-
pdf_reader: '
|
| 41 |
stream: False # bool
|
| 42 |
|
| 43 |
chat_logging:
|
|
|
|
| 37 |
temperature: 0.7 # float
|
| 38 |
repo_id: 'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF' # HuggingFace repo id
|
| 39 |
filename: 'tinyllama-1.1b-chat-v1.0.Q5_0.gguf' # Specific name of gguf file in the repo
|
| 40 |
+
pdf_reader: 'pymupdf' # str [llama, pymupdf, gpt]
|
| 41 |
stream: False # bool
|
| 42 |
|
| 43 |
chat_logging:
|
code/modules/dataloader/data_loader.py
CHANGED
|
@@ -63,7 +63,7 @@ class HTMLReader:
|
|
| 63 |
href = href.replace("http", "https")
|
| 64 |
|
| 65 |
absolute_url = urljoin(base_url, href)
|
| 66 |
-
link[
|
| 67 |
|
| 68 |
resp = requests.head(absolute_url)
|
| 69 |
if resp.status_code != 200:
|
|
@@ -84,6 +84,7 @@ class HTMLReader:
|
|
| 84 |
else:
|
| 85 |
return None
|
| 86 |
|
|
|
|
| 87 |
class FileReader:
|
| 88 |
def __init__(self, logger, kind):
|
| 89 |
self.logger = logger
|
|
@@ -95,8 +96,9 @@ class FileReader:
|
|
| 95 |
else:
|
| 96 |
self.pdf_reader = PDFReader()
|
| 97 |
self.web_reader = HTMLReader()
|
| 98 |
-
self.logger.info(
|
| 99 |
-
|
|
|
|
| 100 |
|
| 101 |
def extract_text_from_pdf(self, pdf_path):
|
| 102 |
text = ""
|
|
@@ -374,7 +376,9 @@ class ChunkProcessor:
|
|
| 374 |
|
| 375 |
class DataLoader:
|
| 376 |
def __init__(self, config, logger=None):
|
| 377 |
-
self.file_reader = FileReader(
|
|
|
|
|
|
|
| 378 |
self.chunk_processor = ChunkProcessor(config, logger=logger)
|
| 379 |
|
| 380 |
def get_chunks(self, uploaded_files, weblinks):
|
|
@@ -392,19 +396,22 @@ if __name__ == "__main__":
|
|
| 392 |
with open("../code/modules/config/config.yml", "r") as f:
|
| 393 |
config = yaml.safe_load(f)
|
| 394 |
|
| 395 |
-
STORAGE_DIR = os.path.join(BASE_DIR, config[
|
| 396 |
uploaded_files = [
|
| 397 |
-
os.path.join(STORAGE_DIR, file)
|
|
|
|
|
|
|
| 398 |
]
|
| 399 |
|
| 400 |
data_loader = DataLoader(config, logger=logger)
|
| 401 |
document_chunks, document_names, documents, document_metadata = (
|
| 402 |
data_loader.get_chunks(
|
| 403 |
-
[
|
|
|
|
|
|
|
| 404 |
[],
|
| 405 |
)
|
| 406 |
)
|
| 407 |
|
| 408 |
print(document_names[:5])
|
| 409 |
print(len(document_chunks))
|
| 410 |
-
|
|
|
|
| 63 |
href = href.replace("http", "https")
|
| 64 |
|
| 65 |
absolute_url = urljoin(base_url, href)
|
| 66 |
+
link["href"] = absolute_url
|
| 67 |
|
| 68 |
resp = requests.head(absolute_url)
|
| 69 |
if resp.status_code != 200:
|
|
|
|
| 84 |
else:
|
| 85 |
return None
|
| 86 |
|
| 87 |
+
|
| 88 |
class FileReader:
|
| 89 |
def __init__(self, logger, kind):
|
| 90 |
self.logger = logger
|
|
|
|
| 96 |
else:
|
| 97 |
self.pdf_reader = PDFReader()
|
| 98 |
self.web_reader = HTMLReader()
|
| 99 |
+
self.logger.info(
|
| 100 |
+
f"Initialized FileReader with {kind} PDF reader and HTML reader"
|
| 101 |
+
)
|
| 102 |
|
| 103 |
def extract_text_from_pdf(self, pdf_path):
|
| 104 |
text = ""
|
|
|
|
| 376 |
|
| 377 |
class DataLoader:
|
| 378 |
def __init__(self, config, logger=None):
|
| 379 |
+
self.file_reader = FileReader(
|
| 380 |
+
logger=logger, kind=config["llm_params"]["pdf_reader"]
|
| 381 |
+
)
|
| 382 |
self.chunk_processor = ChunkProcessor(config, logger=logger)
|
| 383 |
|
| 384 |
def get_chunks(self, uploaded_files, weblinks):
|
|
|
|
| 396 |
with open("../code/modules/config/config.yml", "r") as f:
|
| 397 |
config = yaml.safe_load(f)
|
| 398 |
|
| 399 |
+
STORAGE_DIR = os.path.join(BASE_DIR, config["vectorstore"]["data_path"])
|
| 400 |
uploaded_files = [
|
| 401 |
+
os.path.join(STORAGE_DIR, file)
|
| 402 |
+
for file in os.listdir(STORAGE_DIR)
|
| 403 |
+
if file != "urls.txt"
|
| 404 |
]
|
| 405 |
|
| 406 |
data_loader = DataLoader(config, logger=logger)
|
| 407 |
document_chunks, document_names, documents, document_metadata = (
|
| 408 |
data_loader.get_chunks(
|
| 409 |
+
[
|
| 410 |
+
"https://dl4ds.github.io/sp2024/static_files/discussion_slides/00_discussion.pdf"
|
| 411 |
+
],
|
| 412 |
[],
|
| 413 |
)
|
| 414 |
)
|
| 415 |
|
| 416 |
print(document_names[:5])
|
| 417 |
print(len(document_chunks))
|
|
|