Spaces:

AzizWazir
/

PDF-Convertor

Sleeping

App Files Files Community

AzizWazir committed on Dec 29, 2024

Commit

c8e848b

verified ·

1 Parent(s): 94b0e1a

Update app.py

Browse files

Files changed (1) hide show

app.py +53 -24

app.py CHANGED Viewed

@@ -1,37 +1,66 @@
 import streamlit as st
 import fitz  # PyMuPDF
 import pandas as pd
-def extract_tables_from_pdf(uploaded_file):
-    # Open the uploaded PDF file (this will be a file-like object)
-    doc = fitz.open(stream=uploaded_file.read())  # Use .read() to pass the file content as a stream
-    tables = []
-    for page_num in range(len(doc)):
-        page = doc.load_page(page_num)
-        text = page.get_text("text")
-        rows = text.split("\n")
-        table_data = [row.split() for row in rows if row]
-        if table_data:
-            tables.append(table_data)
-    return tables
-def main():
-    st.title("PDF Table Extraction Tool")
     # File uploader widget in Streamlit
     uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
     if uploaded_file is not None:
-        # Call function to process the uploaded PDF file
-        tables = extract_tables_from_pdf(uploaded_file)
-        if tables:
-            st.write("Extracted Tables:")
-            for table in tables:
-                st.write(pd.DataFrame(table))
-        else:
-            st.write("No tables found in the PDF.")
 if __name__ == "__main__":
     main()

 import streamlit as st
 import fitz  # PyMuPDF
+import pytesseract
+from pdf2image import convert_from_path
+from PIL import Image
 import pandas as pd
+from docx import Document
+import io
+# OCR function to convert image-based PDF to text
+def extract_text_from_image_pdf(uploaded_file):
+    """Run OCR over every page of a PDF and return the recognized text.
+
+    Args:
+        uploaded_file: Either a filesystem path to the PDF or a binary
+            file-like object holding it (``main`` passes the object
+            returned by ``st.file_uploader``).
+
+    Returns:
+        The OCR text of all pages, joined with newlines.
+    """
+    if hasattr(uploaded_file, "read"):
+        # convert_from_path() needs a filesystem path, but an upload is an
+        # in-memory file-like object, so rasterize its raw bytes instead.
+        from pdf2image import convert_from_bytes
+
+        # Prefer getvalue() when available: unlike read(), it does not
+        # depend on the current stream position.
+        if hasattr(uploaded_file, "getvalue"):
+            pdf_bytes = uploaded_file.getvalue()
+        else:
+            pdf_bytes = uploaded_file.read()
+        images = convert_from_bytes(pdf_bytes)
+    else:
+        images = convert_from_path(uploaded_file)
+    # One OCR pass per rendered page image.
+    return "\n".join(pytesseract.image_to_string(image) for image in images)
+# Save text to Word document
+def save_to_word(text, output_filename):
+    """Write ``text`` as a single paragraph of a new Word document.
+
+    Args:
+        text: The text to store.
+        output_filename: Destination passed straight to ``Document.save``.
+    """
+    # XML cannot represent control characters other than tab, newline and
+    # carriage return, so drop them before handing the text to python-docx.
+    # NOTE(review): OCR output is expected to contain form feeds (\x0c) as
+    # page separators -- confirm against the pytesseract version in use.
+    illegal_chars = dict.fromkeys(
+        code for code in range(0x20) if code not in (0x09, 0x0A, 0x0D)
+    )
+    doc = Document()
+    doc.add_paragraph(text.translate(illegal_chars))
+    doc.save(output_filename)
+# Save text to Excel document
+def save_to_excel(text, output_filename):
+    """Write the whitespace-separated tokens of ``text`` to an Excel file.
+
+    Each line of ``text`` that contains at least one token becomes a row and
+    each whitespace-delimited token becomes a cell. This is a simplified
+    split; adjust it if the data needs proper column detection.
+
+    Args:
+        text: The text to tabulate.
+        output_filename: Destination passed straight to ``DataFrame.to_excel``.
+    """
+    # Tokenize first and keep only lines that produced tokens, so that
+    # whitespace-only lines do not turn into empty spreadsheet rows.
+    table_data = [cells for cells in map(str.split, text.split("\n")) if cells]
+    df = pd.DataFrame(table_data)
+    df.to_excel(output_filename, index=False)
+# Main function
+def main():
+    """Render the page: upload a PDF, show its OCR text, offer exports."""
+    st.title("PDF (Image-based) to Text-based Document Converter")
     # File uploader widget in Streamlit
     uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
     if uploaded_file is not None:
+        # Convert image-based PDF to text using OCR
+        extracted_text = extract_text_from_image_pdf(uploaded_file)
+        st.write("Extracted Text:")
+        st.text_area("Text from PDF", extracted_text, height=300)
+        # Build each export in memory and hand it to the browser: a file
+        # written to the server's working directory is not reachable by the
+        # person using the page.
+        if st.button("Save as Word"):
+            word_filename = "extracted_text.docx"
+            word_buffer = io.BytesIO()
+            save_to_word(extracted_text, word_buffer)
+            st.download_button(
+                "Download Word file",
+                data=word_buffer.getvalue(),
+                file_name=word_filename,
+                mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+            )
+        if st.button("Save as Excel"):
+            excel_filename = "extracted_text.xlsx"
+            excel_buffer = io.BytesIO()
+            save_to_excel(extracted_text, excel_buffer)
+            st.download_button(
+                "Download Excel file",
+                data=excel_buffer.getvalue(),
+                file_name=excel_filename,
+                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+            )
 # Script entry point: build the page when this file is run directly.
 if __name__ == "__main__":
     main()