Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -77,34 +77,39 @@ def text_to_txt_bytes(text):
|
|
| 77 |
except Exception as e:
|
| 78 |
raise RuntimeError(f"Error during TXT conversion: {e}")
|
| 79 |
|
| 80 |
-
def on_extract(
|
| 81 |
"""
|
| 82 |
Callback function to extract text from PDF and return CSV and TXT data.
|
| 83 |
|
| 84 |
Args:
|
| 85 |
-
|
| 86 |
|
| 87 |
Returns:
|
| 88 |
-
tuple: CSV
|
| 89 |
"""
|
| 90 |
-
if
|
| 91 |
-
return
|
| 92 |
|
| 93 |
try:
|
| 94 |
# Extract text and create DataFrame
|
| 95 |
-
df, full_text = extract_text_with_langchain_pdf(
|
| 96 |
|
| 97 |
# Convert DataFrame to CSV bytes
|
| 98 |
csv_bytes = df_to_csv_bytes(df)
|
| 99 |
-
csv_filename = f"{
|
| 100 |
|
| 101 |
# Convert full text to TXT bytes
|
| 102 |
txt_bytes = text_to_txt_bytes(full_text)
|
| 103 |
-
txt_filename = f"{
|
| 104 |
|
| 105 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
except Exception as e:
|
| 107 |
-
return
|
| 108 |
|
| 109 |
with gr.Blocks() as demo:
|
| 110 |
gr.Markdown("# 📄 PDF Text Extractor with Metadata and Multiple Exports")
|
|
@@ -113,7 +118,7 @@ with gr.Blocks() as demo:
|
|
| 113 |
pdf_input = gr.File(
|
| 114 |
label="Upload PDF",
|
| 115 |
file_types=[".pdf"],
|
| 116 |
-
type="filepath",
|
| 117 |
interactive=True
|
| 118 |
)
|
| 119 |
|
|
@@ -121,15 +126,17 @@ with gr.Blocks() as demo:
|
|
| 121 |
extract_button = gr.Button("Extract and Download")
|
| 122 |
|
| 123 |
with gr.Row():
|
| 124 |
-
csv_download = gr.
|
| 125 |
-
label="Download Extracted CSV"
|
|
|
|
| 126 |
)
|
| 127 |
-
txt_download = gr.
|
| 128 |
-
label="Download Full Text"
|
|
|
|
| 129 |
)
|
| 130 |
|
| 131 |
with gr.Row():
|
| 132 |
-
|
| 133 |
label="Status",
|
| 134 |
interactive=False,
|
| 135 |
lines=2
|
|
@@ -138,12 +145,12 @@ with gr.Blocks() as demo:
|
|
| 138 |
extract_button.click(
|
| 139 |
fn=on_extract,
|
| 140 |
inputs=pdf_input,
|
| 141 |
-
outputs=[csv_download, txt_download,
|
| 142 |
)
|
| 143 |
|
| 144 |
gr.Markdown("""
|
| 145 |
---
|
| 146 |
-
Developed Gradio and LangChain.
|
| 147 |
""")
|
| 148 |
|
| 149 |
# Launch the Gradio app
|
|
|
|
| 77 |
except Exception as e:
|
| 78 |
raise RuntimeError(f"Error during TXT conversion: {e}")
|
| 79 |
|
| 80 |
+
def on_extract(pdf_file_path):
    """
    Callback function to extract text from PDF and return CSV and TXT data.

    Args:
        pdf_file_path (str): The file path to the uploaded PDF.

    Returns:
        tuple: Path of the generated CSV file, path of the generated TXT
        file, and a status message. Both paths are None when no file was
        uploaded or when extraction failed.
    """
    # Function-scope imports keep this callback self-contained.
    import os
    import tempfile

    if not pdf_file_path:
        return None, None, "No file uploaded."

    def _write_download(directory, filename, payload):
        """Write payload into directory/filename and return the full path."""
        # NOTE(review): the converters are assumed to return bytes; text is
        # encoded defensively so a str result still yields a valid file.
        if isinstance(payload, str):
            payload = payload.encode("utf-8")
        target = os.path.join(directory, filename)
        with open(target, "wb") as handle:
            handle.write(payload)
        return target

    try:
        # Extract text and create DataFrame
        df, full_text = extract_text_with_langchain_pdf(pdf_file_path)

        # Convert DataFrame to CSV bytes
        csv_bytes = df_to_csv_bytes(df)

        # Convert full text to TXT bytes
        txt_bytes = text_to_txt_bytes(full_text)

        # os.path handles the platform's path separator, unlike splitting
        # on '/' by hand.
        base_name = os.path.splitext(os.path.basename(pdf_file_path))[0]

        # gr.File outputs expect file paths, not (bytes, filename) tuples,
        # so the data is written to disk. A fresh directory per call lets
        # the files keep readable names without colliding across requests.
        out_dir = tempfile.mkdtemp(prefix="pdf_extract_")
        csv_path = _write_download(
            out_dir, f"{base_name}_extracted.csv", csv_bytes
        )
        txt_path = _write_download(
            out_dir, f"{base_name}_full_text.txt", txt_bytes
        )

        # Return CSV and TXT files along with a success message
        return csv_path, txt_path, "Extraction successful!"
    except Exception as e:
        # Report the failure in the status box instead of crashing the UI.
        return None, None, f"Extraction failed: {e}"
|
| 113 |
|
| 114 |
with gr.Blocks() as demo:
|
| 115 |
gr.Markdown("# 📄 PDF Text Extractor with Metadata and Multiple Exports")
|
|
|
|
| 118 |
pdf_input = gr.File(
|
| 119 |
label="Upload PDF",
|
| 120 |
file_types=[".pdf"],
|
| 121 |
+
type="filepath", # Using "filepath" as per Gradio's valid options
|
| 122 |
interactive=True
|
| 123 |
)
|
| 124 |
|
|
|
|
| 126 |
extract_button = gr.Button("Extract and Download")
|
| 127 |
|
| 128 |
with gr.Row():
|
| 129 |
+
csv_download = gr.File(
|
| 130 |
+
label="Download Extracted CSV",
|
| 131 |
+
interactive=False
|
| 132 |
)
|
| 133 |
+
txt_download = gr.File(
|
| 134 |
+
label="Download Full Text",
|
| 135 |
+
interactive=False
|
| 136 |
)
|
| 137 |
|
| 138 |
with gr.Row():
|
| 139 |
+
status_output = gr.Textbox(
|
| 140 |
label="Status",
|
| 141 |
interactive=False,
|
| 142 |
lines=2
|
|
|
|
| 145 |
extract_button.click(
|
| 146 |
fn=on_extract,
|
| 147 |
inputs=pdf_input,
|
| 148 |
+
outputs=[csv_download, txt_download, status_output]
|
| 149 |
)
|
| 150 |
|
| 151 |
gr.Markdown("""
|
| 152 |
---
|
| 153 |
+
Developed with ❤️ using Gradio and LangChain.
|
| 154 |
""")
|
| 155 |
|
| 156 |
# Launch the Gradio app
|