document-parser-rag

Running on Zero

Liam Dyer committed on May 30, 2024

Commit

d6c1ef6

unverified ·

1 Parent(s): 1fc553f

add filenames because of a gradio client bug

Files changed (1) hide show

app.py CHANGED Viewed

@@ -86,7 +86,7 @@ def convert_pandoc(input_file, filename) -> str:
 @spaces.GPU
-def convert(input_file) -> str:
     plain_text_filetypes = [
         ".txt",
         ".csv",
@@ -99,14 +99,14 @@ def convert(input_file) -> str:
         ".jsonc",
     ]
     # Already a plain text file that wouldn't benefit from pandoc so return the content
-    if any(input_file.endswith(ft) for ft in plain_text_filetypes):
         with open(input_file, "r") as f:
             return f.read()
-    if input_file.endswith(".pdf"):
         return convert_pdf(input_file)
-    return convert_pandoc(input_file, input_file)
 def chunk_to_length(text, max_length=512):
@@ -119,11 +119,14 @@ def chunk_to_length(text, max_length=512):
 @spaces.GPU
-def predict(queries, documents, max_characters) -> list[list[str]]:
     queries = queries.split("\n")
-    # Conver the documents to text
-    converted_docs = [convert(doc) for doc in documents]
     # Return if the total length is less than the max characters
     total_doc_lengths = sum([len(doc) for doc in converted_docs])
@@ -193,6 +196,7 @@ gr.Interface(
     inputs=[
         gr.Textbox(label="Queries separated by newline"),
         gr.File(label="Upload File", file_count="multiple"),
         gr.Number(label="Max output characters", value=16384),
     ],
     outputs=[gr.JSON(label="Embedded documents")],

 @spaces.GPU
+def convert(input_file, filename) -> str:
     plain_text_filetypes = [
         ".txt",
         ".csv",
         ".jsonc",
     ]
     # Already a plain text file that wouldn't benefit from pandoc so return the content
+    if any(filename.endswith(ft) for ft in plain_text_filetypes):
         with open(input_file, "r") as f:
             return f.read()
+    if filename.endswith(".pdf"):
         return convert_pdf(input_file)
+    return convert_pandoc(input_file, filename)
 def chunk_to_length(text, max_length=512):
 @spaces.GPU
+def predict(queries, documents, document_filenames, max_characters) -> list[list[str]]:
     queries = queries.split("\n")
+    document_filenames = document_filenames.split("\n")
+    # Convert the documents to text
+    converted_docs = [
+        convert(doc, filename) for doc, filename in zip(documents, document_filenames)
+    ]
     # Return if the total length is less than the max characters
     total_doc_lengths = sum([len(doc) for doc in converted_docs])
     inputs=[
         gr.Textbox(label="Queries separated by newline"),
         gr.File(label="Upload File", file_count="multiple"),
+        gr.Textbox(label="Filenames separated by newline"),
         gr.Number(label="Max output characters", value=16384),
     ],
     outputs=[gr.JSON(label="Embedded documents")],