twoimo committed on
Commit
ff85888
·
verified ·
1 Parent(s): 8a5fed5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +47 -60
app.py CHANGED
@@ -1,86 +1,73 @@
1
  import streamlit as st
2
- from transformers import AutoProcessor, AutoModelForImageTextToText
3
- import torch
4
  from PIL import Image
5
- import io
6
 
# Page chrome: browser-tab title and centered layout, followed by the
# on-page heading and a one-line usage hint.
_PAGE_OPTIONS = {"page_title": "GLM-OCR", "layout": "centered"}
st.set_page_config(**_PAGE_OPTIONS)

st.title("🎯 GLM-OCR: Multimodal OCR Model")
st.markdown("Upload an image to extract text using the GLM-OCR model.")
 
11
 
# Load model with caching
@st.cache_resource
def _load_model_cached():
    """Build the GLM-OCR processor and model; the result is cached.

    Exceptions are deliberately NOT caught here: only a successful return
    value should end up in the resource cache, so a failed attempt can be
    retried on the next script run.

    Returns:
        A ``(processor, model)`` tuple.
    """
    MODEL_PATH = "zai-org/GLM-OCR"
    processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True)
    model = AutoModelForImageTextToText.from_pretrained(
        MODEL_PATH,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
    )
    return processor, model


def load_model():
    """Return the cached ``(processor, model)`` pair.

    On failure the error is shown in the page and ``(None, None)`` is
    returned, exactly as before. The failure itself is not cached, so
    refreshing the page triggers a new load attempt.

    Returns:
        ``(processor, model)`` on success, ``(None, None)`` on any error.
    """
    try:
        return _load_model_cached()
    except Exception as e:
        st.error(f"Error loading model: {str(e)}")
        return None, None
28
 
# Obtain the processor/model pair while showing a progress spinner.
with st.spinner("Loading GLM-OCR model... This may take a moment."):
    processor, model = load_model()

# Nothing below can work without both objects, so halt the script run.
if model is None or processor is None:
    st.error("Failed to load the model. Please try refreshing the page.")
    st.stop()

# Upload widget
ACCEPTED_TYPES = ["jpg", "jpeg", "png", "bmp", "gif"]
uploaded_file = st.file_uploader("Choose an image", type=ACCEPTED_TYPES)

if uploaded_file is not None:
    # Decode the upload, normalise it to RGB and show a preview.
    image = Image.open(uploaded_file).convert("RGB")
    st.image(image, caption="Uploaded Image", use_column_width=True)

    # Recognition only runs when the button is pressed.
    if st.button("Extract Text", type="primary"):
        with st.spinner("Processing image... Please wait."):
            try:
                # A single user turn: the image followed by the text prompt.
                image_part = {"type": "image", "image": image}
                prompt_part = {"type": "text", "text": "Text Recognition:"}
                messages = [{"role": "user", "content": [image_part, prompt_part]}]

                # Tokenise through the chat template, then move to the model's device.
                encoded = processor.apply_chat_template(
                    messages,
                    tokenize=True,
                    add_generation_prompt=True,
                    return_dict=True,
                    return_tensors="pt",
                )
                inputs = encoded.to(model.device)

                # Drop token_type_ids if the processor produced them.
                inputs.pop("token_type_ids", None)

                # Generation without gradient tracking, capped at 2048 new tokens.
                with torch.no_grad():
                    generated_ids = model.generate(**inputs, max_new_tokens=2048)

                # Skip the first input_ids-length positions of the output
                # (presumably the echoed prompt) and decode the remainder.
                prompt_length = inputs["input_ids"].shape[1]
                new_token_ids = generated_ids[0][prompt_length:]
                output_text = processor.decode(new_token_ids, skip_special_tokens=True)

                st.success("Text extraction completed!")
                st.text_area("Extracted Text", value=output_text, height=300)

            except Exception as exc:
                st.error(f"Error processing image: {str(exc)}")

# Footer
st.markdown("---")
st.markdown("Powered by GLM-OCR from [ZAI](https://huggingface.co/zai-org)")
 
 
 
 
 
 
 
1
  import streamlit as st
2
+ from paddleocr import PaddleOCR
 
3
  from PIL import Image
4
+ import numpy as np
5
 
# Page chrome: browser-tab title and centered layout.
_PAGE_OPTIONS = {"page_title": "OCR Demo", "layout": "centered"}
st.set_page_config(**_PAGE_OPTIONS)

# Heading plus an introductory note explaining why PaddleOCR is used here.
_INTRO_MARKDOWN = """
This is a lightweight OCR demo using PaddleOCR.

**Note**: Originally intended for GLM-OCR, but that model requires GPU resources.
This demo uses PaddleOCR instead, which works on CPU.
"""

st.title("📝 Simple OCR Demo")
st.markdown(_INTRO_MARKDOWN)
15
+
# Initialize PaddleOCR
@st.cache_resource
def _create_ocr():
    """Construct the PaddleOCR engine; the result is cached.

    Exceptions are deliberately NOT caught here: only a successfully built
    engine should end up in the resource cache, so a failed attempt can be
    retried on the next script run.

    Returns:
        A ``PaddleOCR`` instance configured for English, CPU-only, with the
        angle classifier enabled.
    """
    return PaddleOCR(use_angle_cls=True, lang='en', use_gpu=False)


def load_ocr():
    """Return the cached OCR engine, or ``None`` if it cannot be created.

    On failure the error is shown in the page and ``None`` is returned,
    exactly as before. The failure itself is not cached, so refreshing the
    page triggers a new load attempt.
    """
    try:
        return _create_ocr()
    except Exception as e:
        st.error(f"Error loading OCR: {e}")
        return None
25
 
# Obtain the OCR engine while showing a progress spinner.
with st.spinner("Loading OCR model..."):
    ocr = load_ocr()

# Nothing below can work without the engine, so halt the script run.
if ocr is None:
    st.error("Failed to load OCR model. Please try refreshing.")
    st.stop()

# File uploader
uploaded_file = st.file_uploader(
    "Upload an image",
    type=["jpg", "jpeg", "png", "bmp"],
)

if uploaded_file is not None:
    # Decode the upload and normalise it to 3-channel RGB. Without the
    # conversion, palette ("P"), grayscale, 1-bit or RGBA files turn into
    # arrays that are not height x width x 3.
    try:
        image = Image.open(uploaded_file).convert("RGB")
    except OSError as e:
        # Pillow reports unreadable or truncated image data as OSError
        # (UnidentifiedImageError is a subclass). Show it in the page
        # instead of letting the script crash with a traceback.
        image = None
        st.error(f"Could not read the uploaded image: {e}")

    if image is not None:
        # Display image
        st.image(image, caption="Uploaded Image", use_column_width=True)

        if st.button("Extract Text", type="primary"):
            with st.spinner("Processing..."):
                try:
                    # Convert to numpy array
                    img_array = np.array(image)

                    # Run OCR
                    result = ocr.ocr(img_array, cls=True)

                    # result[0] holds the detections for the single input
                    # image; an empty or None entry means nothing was found.
                    if result and result[0]:
                        st.success("Text extraction completed!")

                        # Each detection is indexed as line[1][0] for its text.
                        # NOTE(review): assumes the [box, (text, score)] layout;
                        # confirm against the installed PaddleOCR version.
                        extracted_text = "\n".join(line[1][0] for line in result[0])

                        st.text_area("Extracted Text", value=extracted_text, height=300)
                    else:
                        st.warning("No text found in the image.")

                except Exception as e:
                    st.error(f"Error: {str(e)}")

# Footer
st.markdown("---")
st.markdown("""
**About GLM-OCR**:
The original [GLM-OCR model](https://huggingface.co/zai-org/GLM-OCR) is a powerful 0.9B parameter
multimodal OCR model, but requires GPU resources to run efficiently.

For CPU-only environments like Hugging Face CPU Spaces, lighter alternatives like PaddleOCR are more suitable.
""")