document-parser-rag

Running on Zero

Liam Dyer committed on May 23, 2024

Commit

9a1c39c

unverified ·

1 Parent(s): 7ca6619

feat: support filename input

Files changed (1) hide show

app.py CHANGED Viewed

@@ -2,6 +2,7 @@ import gradio as gr
 import spaces
 import subprocess
 import os
 import string
 import random
 from pypdf import PdfReader
@@ -57,7 +58,10 @@ def extract_metadata_from_pdf(reader):
     }
-def convert_pandoc(input_file):
     # Convert the file to markdown with pandoc
     output_file = f"{random_word(16)}.md"
     result = subprocess.call(
@@ -66,31 +70,34 @@ def convert_pandoc(input_file):
     if result != 0:
         raise ValueError("Error converting file to markdown with pandoc")
-    # Read the file and delete
     with open(output_file, "r") as f:
         markdown = f.read()
     os.remove(output_file)
     return markdown
 @spaces.GPU
-def convert(input_file):
     plain_text_filetypes = [".txt", ".csv", ".tsv", ".md"]
     # Already a plain text file that wouldn't benefit from pandoc so return the content
-    if any(input_file.endswith(ft) for ft in plain_text_filetypes):
         with open(input_file, "r") as f:
             return f.read(), {}
-    if input_file.endswith(".pdf"):
         return convert_pdf(input_file)
-    return convert_pandoc(input_file), {}
 gr.Interface(
     convert,
-    inputs=gr.File(label="Upload File", type="filepath"),
     outputs=[
         gr.Text(label="Markdown"),
         gr.JSON(label="Metadata"),

 import spaces
 import subprocess
 import os
+import shutil
 import string
 import random
 from pypdf import PdfReader
     }
+def convert_pandoc(input_file, filename):
+    # Temporarily copy the file
+    shutil.copyfile(input_file, filename)
     # Convert the file to markdown with pandoc
     output_file = f"{random_word(16)}.md"
     result = subprocess.call(
     if result != 0:
         raise ValueError("Error converting file to markdown with pandoc")
+    # Read the file and delete temporary files
     with open(output_file, "r") as f:
         markdown = f.read()
     os.remove(output_file)
+    os.remove(filename)
     return markdown
 @spaces.GPU
+def convert(input_file, filename):
     plain_text_filetypes = [".txt", ".csv", ".tsv", ".md"]
     # Already a plain text file that wouldn't benefit from pandoc so return the content
+    if any(filename.endswith(ft) for ft in plain_text_filetypes):
         with open(input_file, "r") as f:
             return f.read(), {}
+    if filename.endswith(".pdf"):
         return convert_pdf(input_file)
+    return convert_pandoc(input_file, filename), {}
+# We accept a filename because the gradio JS interface removes this information
+# and it's critical for choosing the correct processing pipeline
 gr.Interface(
     convert,
+    inputs=[gr.File(label="Upload File", type="filepath"), gr.Text(label="Filename")],
     outputs=[
         gr.Text(label="Markdown"),
         gr.JSON(label="Metadata"),