engrrifatullah committed on
Commit
47b61ab
·
verified ·
1 Parent(s): f395fbd

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +46 -0
app.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Streamlit app: extract handwritten text from an image and export it.

The user uploads a JPG/PNG image, the TrOCR handwritten model transcribes
it, and the transcription is shown on the page and offered for download
as DOCX and PDF.
"""

import html
import io

import streamlit as st
import pytesseract
import cv2
import numpy as np
from PIL import Image
import pdfkit
from docx import Document
from transformers import pipeline

# Names suggested to the browser for the downloaded files.
DOCX_FILENAME = "extracted_text.docx"
PDF_FILENAME = "extracted_text.pdf"

DOCX_MIME = (
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
)
PDF_MIME = "application/pdf"


@st.cache_resource
def _load_ocr_pipeline():
    """Build the image-to-text pipeline once and reuse it across reruns.

    Streamlit re-executes this script on every widget interaction; without
    caching the model would be re-instantiated each time.
    """
    return pipeline("image-to-text", model="microsoft/trocr-base-handwritten")


def _build_docx_bytes(text):
    """Return *text* as the bytes of a single-paragraph DOCX document."""
    document = Document()
    document.add_paragraph(text)
    # Save to memory rather than a fixed file name so concurrent sessions
    # cannot overwrite each other's output.
    buffer = io.BytesIO()
    document.save(buffer)
    return buffer.getvalue()


def _build_pdf_bytes(text):
    """Return *text* rendered to PDF bytes via pdfkit.

    Raises:
        OSError: if pdfkit cannot locate or run the wkhtmltopdf executable.
    """
    # pdfkit treats its input as HTML, so the plain text must be escaped;
    # otherwise characters such as "<" or "&" are parsed as markup.
    body = html.escape(text).replace("\n", "<br>")
    markup = (
        '<html><head><meta charset="utf-8"></head>'
        f"<body><p>{body}</p></body></html>"
    )
    # Passing False as the output path makes pdfkit return the PDF bytes
    # instead of writing a file.
    return pdfkit.from_string(markup, False)


# Set up OCR pipeline (you can replace with Hugging Face model)
ocr_pipeline = _load_ocr_pipeline()

# Streamlit UI
st.title("Handwritten Text Extractor")

# File uploader
uploaded_file = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])

if uploaded_file is not None:
    image = Image.open(uploaded_file)
    st.image(image, caption="Uploaded Image", use_column_width=True)

    # The pipeline accepts a PIL image (or a path/URL), not a NumPy array.
    # Converting to RGB also normalises RGBA / palette / grayscale PNGs.
    predictions = ocr_pipeline(image.convert("RGB"))
    extracted_text = predictions[0]["generated_text"] if predictions else ""

    # Display extracted text
    st.subheader("Extracted Text")
    st.write(extracted_text)

    # Download buttons receive in-memory bytes, so no file handles are
    # left open and nothing is written to the working directory.
    st.download_button(
        "Download as DOCX",
        data=_build_docx_bytes(extracted_text),
        file_name=DOCX_FILENAME,
        mime=DOCX_MIME,
    )

    try:
        pdf_bytes = _build_pdf_bytes(extracted_text)
    except OSError as exc:
        # A failed PDF export must not take the DOCX download (or the
        # whole page) down with it.
        st.warning(f"PDF export is unavailable: {exc}")
    else:
        st.download_button(
            "Download as PDF",
            data=pdf_bytes,
            file_name=PDF_FILENAME,
            mime=PDF_MIME,
        )