Spaces:
Sleeping
Sleeping
File size: 2,685 Bytes
1168986 bdc3ab9 1168986 b1cf141 bdc3ab9 1168986 bdc3ab9 1168986 bdc3ab9 1168986 bdc3ab9 1168986 bdc3ab9 1168986 bdc3ab9 1168986 bdc3ab9 1168986 bdc3ab9 1168986 bdc3ab9 1168986 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 |
import os
import re
import tempfile

import pandas as pd
import pytesseract
import streamlit as st
from docx import Document
from pdf2image import convert_from_path
# Path to the sample PDF used by the optional import-time conversion check below.
pdf_path = "path_to_your_pdf.pdf"

# Directory holding the Poppler binaries. Set the POPPLER_PATH environment
# variable to point at it. When the variable is unset or empty this is None,
# which should make pdf2image fall back to the binaries found on PATH instead
# of a hard-coded, Windows-only placeholder directory.
poppler_path = os.environ.get("POPPLER_PATH") or None

# Only attempt the sample conversion when the sample file actually exists, so
# loading the module does not report an error for the placeholder path.
if os.path.isfile(pdf_path):
    try:
        # Convert PDF to images
        images = convert_from_path(pdf_path, poppler_path=poppler_path)
        print(f"Converted {len(images)} pages to images successfully!")
    except Exception as e:
        print(f"An error occurred: {e}")
# Function to extract text from an image-based PDF
def extract_text_from_image_pdf(pdf_path: str) -> str:
    """Run OCR over every page of a PDF and return the combined text.

    Each page is rendered to an image with ``convert_from_path`` and then
    passed to ``pytesseract.image_to_string``.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        One ``"Page N:\\n<text>"`` section per page (N starts at 1), with the
        sections joined by newlines.
    """
    # The Poppler location is the module-level ``poppler_path`` setting; the
    # upper-case ``POPPLER_PATH`` name is not defined anywhere in this module.
    images = convert_from_path(pdf_path, poppler_path=poppler_path)
    page_sections = [
        f"Page {page_num}:\n{pytesseract.image_to_string(image)}"
        for page_num, image in enumerate(images, start=1)
    ]
    return "\n".join(page_sections)
# Function to save extracted text to a Word file
def save_text_to_word(text: str, output_path: str) -> None:
    """Write *text* to a new Word document saved at *output_path*.

    The whole text is added as a single paragraph.

    Args:
        text: Text to store in the document.
        output_path: Destination path of the ``.docx`` file.
    """
    # Drop control characters other than tab, newline and carriage return.
    # OCR output commonly carries form feeds (page separators), and such
    # characters are not XML-compatible, so writing them into the document
    # is expected to fail -- NOTE(review): confirm against the installed
    # python-docx/lxml versions.
    cleaned = re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f]", "", text)
    doc = Document()
    doc.add_paragraph(cleaned)
    doc.save(output_path)
# Function to save extracted text to an Excel file
def save_text_to_excel(text: str, output_path: str) -> None:
    """Write *text* to an Excel sheet, one line of text per row.

    The sheet has a single ``Text`` column and no index column.

    Args:
        text: Text to store; it is split into rows on ``"\\n"``.
        output_path: Destination path of the ``.xlsx`` file.
    """
    # Drop control characters other than tab, newline and carriage return.
    # OCR output commonly carries form feeds (page separators), which the
    # Excel writer is expected to reject as illegal cell content --
    # NOTE(review): confirm against the installed openpyxl version.
    cleaned = re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f]", "", text)
    df = pd.DataFrame({"Text": cleaned.split("\n")})
    df.to_excel(output_path, index=False)
def main() -> None:
    """Render the Streamlit page: upload a PDF, OCR it, and offer downloads.

    The uploaded PDF is written to a temporary directory, passed to
    ``extract_text_from_image_pdf``, and the resulting text is shown in a text
    area. The "Download as Word" / "Download as Excel" buttons build the
    corresponding file and expose it through a download button. Any exception
    raised while processing is reported with ``st.error``.
    """
    st.title("PDF Image to Text Converter")
    st.write("Upload an image-based PDF to extract text and save as text, Word, or Excel format.")
    uploaded_file = st.file_uploader("Upload PDF", type=["pdf"])
    if uploaded_file is None:
        return

    # Work in a private scratch directory so that concurrent sessions do not
    # overwrite each other's files; it is removed, with everything in it, when
    # the ``with`` block exits.
    with tempfile.TemporaryDirectory() as work_dir:
        tmp_file_path = os.path.join(work_dir, "uploaded_file.pdf")
        with open(tmp_file_path, "wb") as f:
            # getvalue() returns the full upload regardless of how far the
            # stream has already been read.
            f.write(uploaded_file.getvalue())
        try:
            with st.spinner("Processing..."):
                extracted_text = extract_text_from_image_pdf(tmp_file_path)
            st.success("Text extracted successfully!")
            st.text_area("Extracted Text", extracted_text, height=300)

            # Options to download text in different formats. The generated
            # files are read into memory inside a ``with`` block so no file
            # handle is left open.
            if st.button("Download as Word"):
                docx_path = os.path.join(work_dir, "output.docx")
                save_text_to_word(extracted_text, docx_path)
                with open(docx_path, "rb") as f:
                    st.download_button("Download Word File", f.read(), "output.docx")
            if st.button("Download as Excel"):
                xlsx_path = os.path.join(work_dir, "output.xlsx")
                save_text_to_excel(extracted_text, xlsx_path)
                with open(xlsx_path, "rb") as f:
                    st.download_button("Download Excel File", f.read(), "output.xlsx")
        except Exception as e:
            st.error(f"An error occurred: {e}")
# Start the app only when this file is run as a script, not when it is imported.
if __name__ == "__main__":
    main()
|