Added OCR model, replaced the old YOLO model with a new one trained using rotation augmentation, Streamlit tabs -> multipage app
- .gitignore +3 -0
- Hello.py +15 -0
- weights.pt → models/YOLO/weights.pt +2 -2
- pages/Capture_Image.py +26 -0
- pages/Upload_An_Image.py +24 -0
- app.py → utils.py +51 -40
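For orientation, here is a minimal usage sketch. It is not part of this commit, and it assumes the @st.cache_resource-decorated loaders behave as plain functions when called outside a running Streamlit app. It chains the YOLO detector from load_model, the TrOCR model/processor pair from load_ocr_model, and the new extract_text_patches / ocr_predict helpers defined in utils.py below; the example image name comes from the repo, while the output filename and printout are hypothetical.

from PIL import Image
from utils import load_model, load_ocr_model, predict, extract_text_patches, ocr_predict

if __name__ == "__main__":
    # Load the cached YOLO detector and the fine-tuned TrOCR model + processor
    yolo = load_model()
    ocr_model, ocr_processor = load_ocr_model()

    # Run detection on one of the repo's example sketches
    image = Image.open("example1.jpg").convert("RGB")
    annotated_img, result = predict(yolo, image, font_size=24, line_width=3)
    annotated_img.save("annotated_example1.jpg")  # illustrative output path

    # Crop the boxes YOLO labeled as 'text' and transcribe them with TrOCR
    crops, text_bboxes = extract_text_patches(result, image)
    if crops:
        texts = ocr_predict(ocr_model, ocr_processor, crops)
        for text, (xmin, ymin, xmax, ymax) in zip(texts, text_bboxes):
            print(f"({xmin}, {ymin}, {xmax}, {ymax}): {text}")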
.gitignore
ADDED
@@ -0,0 +1,3 @@
+__pycache__
+main.ipynb
+blankexample.jpeg
Hello.py
ADDED
@@ -0,0 +1,15 @@
+import streamlit as st
+
+if __name__ == "__main__":
+    # set page configurations and display/annotation options
+    st.set_page_config(
+        page_title="Circuit Sketch Recognizer",
+        layout="wide"
+    )
+
+    st.title("Circuit Sketch Recognition")
+    col1, col2 = st.columns(2)
+    with col1:
+        st.image('example1.jpg', use_column_width=True, caption='Example 1')
+    with col2:
+        st.image('example2.jpg', use_column_width=True, caption='Example 2')
weights.pt → models/YOLO/weights.pt
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:8f93346972611fd027af6c1b1dfc9cd818f48e794d682466e2ef3ba6042721df
+size 52163457
pages/Capture_Image.py
ADDED
@@ -0,0 +1,26 @@
+import streamlit as st
+import sys
+import os
+sys.path.insert(1, os.path.join(sys.path[0], '..'))
+from utils import load_model, image_capture_cb, load_ocr_model
+
+if __name__ == "__main__":
+    # set page configurations and display/annotation options
+    st.set_page_config(
+        page_title="Circuit Sketch Recognizer",
+        layout="wide"
+    )
+
+    with st.sidebar:
+        font_size = st.slider(label="Font Size", min_value=6, max_value=64, step=1, value=24)
+        line_width = st.slider(label="Bounding Box Line Thickness", min_value=1, max_value=8, step=1, value=3)
+
+    model = load_model()
+    ocr_model, ocr_processor = load_ocr_model()
+
+    # Camera Input allows user to take a picture
+    col1, col2 = st.columns(2)
+    with col1:
+        capture = st.camera_input("Take a picture with Camera")
+        if capture is not None:
+            image_capture_cb(model, ocr_model, ocr_processor, capture, font_size, line_width, col2)
pages/Upload_An_Image.py
ADDED
@@ -0,0 +1,24 @@
+import streamlit as st
+import sys
+import os
+sys.path.insert(1, os.path.join(sys.path[0], '..'))
+from utils import load_model, file_uploader_cb, load_ocr_model
+
+if __name__ == "__main__":
+    # set page configurations and display/annotation options
+    st.set_page_config(
+        page_title="Circuit Sketch Recognizer",
+        layout="wide"
+    )
+
+    with st.sidebar:
+        font_size = st.slider(label="Font Size", min_value=6, max_value=64, step=1, value=24)
+        line_width = st.slider(label="Bounding Box Line Thickness", min_value=1, max_value=8, step=1, value=3)
+
+    model = load_model()
+    ocr_model, ocr_processor = load_ocr_model()
+
+    # File uploader allows user to add their own image
+    uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])
+    if uploaded_file is not None:
+        file_uploader_cb(model, ocr_model, ocr_processor, uploaded_file, font_size, line_width)
app.py → utils.py
RENAMED
@@ -1,16 +1,26 @@
 import streamlit as st
 from PIL import Image
-import numpy as np
 from ultralytics import YOLO  # Make sure this import works in your Hugging Face environment
 from io import BytesIO
+import numpy as np
+import pandas as pd
+from transformers import VisionEncoderDecoderModel, TrOCRProcessor

+@st.cache_resource
+def load_ocr_model():
+    """
+    Load and cache the ocr model and processor
+    """
+    model = VisionEncoderDecoderModel.from_pretrained('edesaras/TROCR_finetuned_on_CSTA', cache_dir='./models/TrOCR')
+    processor = TrOCRProcessor.from_pretrained("edesaras/TROCR_finetuned_on_CSTA", cache_dir='./models/TrOCR')
+    return model, processor

 @st.cache_resource
 def load_model():
     """
     Load and cache the model
     """
-    model = YOLO(
+    model = YOLO('./models/YOLO/weights.pt')
     return model

 def predict(model, image, font_size, line_width):
@@ -21,16 +31,37 @@ def predict(model, image, font_size, line_width):
     r = results[0]
     im_bgr = r.plot(conf=False, pil=True, font_size=font_size, line_width=line_width)  # Returns a PIL image if pil=True
     im_rgb = Image.fromarray(im_bgr[..., ::-1])  # Convert BGR to RGB
-    return im_rgb
+    return im_rgb, r

-def file_uploader_cb(uploaded_file, font_size, line_width):
+def extract_text_patches(result, image):
+    image = np.array(image)
+    text_bboxes = []
+    for i, label in enumerate([result.names[id.item()] for id in result.boxes.cls]):
+        if label == 'text':
+            bbox = result.boxes.xyxy[i]
+            text_bboxes.append([round(i.item()) for i in bbox])
+    crops = []
+    for box in text_bboxes:
+        xmin, ymin, xmax, ymax = box
+        crop_img = image[ymin:ymax, xmin:xmax]
+        crops.append(crop_img)
+    return crops, text_bboxes
+
+def ocr_predict(model, processor, crops):
+    pixel_values = processor(crops, return_tensors="pt").pixel_values
+    # Generate text with TrOCR
+    generated_ids = model.generate(pixel_values)
+    texts = processor.batch_decode(generated_ids, skip_special_tokens=True)
+    return texts
+
+def file_uploader_cb(model, ocr_model, ocr_processor, uploaded_file, font_size, line_width):
     image = Image.open(uploaded_file).convert("RGB")
     col1, col2 = st.columns(2)
     with col1:
         # Display Uploaded image
         st.image(image, caption='Uploaded Image', use_column_width=True)
     # Perform inference
-    annotated_img = predict(model, image, font_size, line_width)
+    annotated_img, result = predict(model, image, font_size, line_width)
     with col2:
         # Display the prediction
         st.image(annotated_img, caption='Prediction', use_column_width=True)
@@ -38,11 +69,18 @@ def file_uploader_cb(uploaded_file, font_size, line_width):
     imbuffer = BytesIO()
     annotated_img.save(imbuffer, format="JPEG")
     st.download_button("Download Annotated Image", data=imbuffer, file_name="Annotated_Sketch.jpeg", mime="image/jpeg", key="upload")
+
+    st.subheader('Transcription')
+    crops, text_bboxes = extract_text_patches(result, image)
+    texts = ocr_predict(ocr_model, ocr_processor, crops)
+    transcription_df = pd.DataFrame(zip(texts, *np.array(text_bboxes).T, [st.image(crop) for crop in crops]),
+                                    columns=['Transcription', 'xmin', 'ymin', 'xmax', 'ymax', 'Image'])
+    st.dataframe(transcription_df)

-def image_capture_cb(capture, font_size, line_width, col):
+def image_capture_cb(model, ocr_model, ocr_processor, capture, font_size, line_width, col):
     image = Image.open(capture).convert("RGB")
     # Perform inference
-    annotated_img = predict(model, image, font_size, line_width)
+    annotated_img, result = predict(model, image, font_size, line_width)
     with col:
         # Display the prediction
         st.image(annotated_img, caption='Prediction', use_column_width=True)
@@ -51,36 +89,9 @@ def image_capture_cb(capture, font_size, line_width, col):
     annotated_img.save(imbuffer, format="JPEG")
     st.download_button("Download Annotated Image", data=imbuffer, file_name="Annotated_Sketch.jpeg", mime="image/jpeg", key="capture")

-if __name__ == "__main__":
-    # set page configurations and display/annotation options
-    st.set_page_config(
-        page_title="Circuit Sketch Recognizer",
-        layout="wide"
-    )
-    st.title("Circuit Sketch Recognition")
-    with st.sidebar:
-        font_size = st.slider(label="Font Size", min_value=6, max_value=64, step=1, value=24)
-        line_width = st.slider(label="Bounding Box Line Thickness", min_value=1, max_value=8, step=1, value=3)
-
-    model = load_model()
-
-    # user specifies to take/upload picture, view examples
-    tabs = st.tabs(["Capture Picture", "Upload Your Image", "Show Examples"])
-    with tabs[0]:
-        # File uploader allows user to add their own image
-        uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])
-        if uploaded_file is not None:
-            file_uploader_cb(uploaded_file, font_size, line_width)
-    with tabs[1]:
-        # Camera Input allows user to take a picture
-        col1, col2 = st.columns(2)
-        with col1:
-            capture = st.camera_input("Take a picture with Camera")
-            if capture is not None:
-                image_capture_cb(capture, font_size, line_width, col2)
-    with tabs[2]:
-        col1, col2 = st.columns(2)
-        with col1:
-            st.image('example1.jpg', use_column_width=True, caption='Example 1')
-        with col2:
-            st.image('example2.jpg', use_column_width=True, caption='Example 2')
+
+    st.subheader('Transcription')
+    crops, text_bboxes = extract_text_patches(result, image)
+    texts = ocr_predict(ocr_model, ocr_processor, crops)
+    transcription_df = pd.DataFrame(zip(texts, *np.array(text_bboxes).T),
+                                    columns=['Transcription', 'xmin', 'ymin', 'xmax', 'ymax'])
+    st.dataframe(transcription_df)