Sam20202 committed on
Commit
0533780
·
0 Parent(s):

Initial deploy

Browse files
.gitignore ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ .venv/
3
+ __pycache__/
4
+ *.pyc
5
+ *.pyo
6
+ *.pyd
7
+ *.egg-info/
8
+ dist/
9
+ build/
10
+
11
+ # HuggingFace model cache (don't commit 2GB of weights)
12
+ .cache/
13
+
14
+ # OS
15
+ .DS_Store
16
+ Thumbs.db
17
+
18
+ # Temp files
19
+ *.tmp
20
+ *.log
Dockerfile ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ── GLM-OCR — Dockerfile (Hugging Face Spaces) ────────────────────────────
2
+ #
3
+ # HF Spaces builds this automatically when you push to your Space repo.
4
+ # No local Docker needed — HF handles the build and hosting.
5
+ # ──────────────────────────────────────────────────────────────────────────
6
+
7
+ FROM pytorch/pytorch:2.2.0-cuda12.1-cudnn8-runtime
8
+
9
+ # ── System deps ────────────────────────────────────────────────────────────
10
+ RUN apt-get update && apt-get install -y --no-install-recommends \
11
+ libgl1-mesa-glx \
12
+ libglib2.0-0 \
13
+ curl \
14
+ && rm -rf /var/lib/apt/lists/*
15
+
16
+ # ── Workdir ────────────────────────────────────────────────────────────────
17
+ WORKDIR /app
18
+
19
+ # ── Python deps ────────────────────────────────────────────────────────────
20
+ COPY requirements.txt ./
21
+ RUN pip install --no-cache-dir -r requirements.txt
22
+
23
+ # ── Copy source (flat structure) ───────────────────────────────────────────
24
+ COPY main.py ocr_engine.py ./
25
+ COPY frontend/ ./frontend/
26
+
27
+ # ── Model weights download at first startup (not baked into image) ─────────
28
+ # HF Spaces caches ~/.cache/huggingface across restarts on paid tiers.
29
+ # On the free tier the model (~1-2 GB) re-downloads on each cold start.
30
+
31
+ # ── Expose & run (HF Spaces requires port 7860) ────────────────────────────
32
+ EXPOSE 7860
33
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
Extension/background.js ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // background.js — Service worker
2
+ // Handles: tab screenshot, image crop, OCR API call, result relay
3
+
4
+ const OCR_ENDPOINT = "http://localhost:8000/ocr";
5
+ const OCR_MODE = "recognize"; // or "parse"
6
+
7
+ // ── Listen for messages from content.js ─────────────────────────────────────
8
+
9
+ chrome.runtime.onMessage.addListener((msg, sender, sendResponse) => {
10
+
11
+ if (msg.type === "CAPTURE_REGION") {
12
+ handleCapture(msg.rect, sender.tab)
13
+ .then(result => sendResponse({ success: true, ...result }))
14
+ .catch(error => sendResponse({ success: false, error: error.message }));
15
+ return true; // keep channel open for async
16
+ }
17
+
18
+ if (msg.type === "PING") {
19
+ checkServer().then(ok => sendResponse({ ok }));
20
+ return true;
21
+ }
22
+
23
+ if (msg.type === "OPEN_SIDEBAR") {
24
+ // Open the sidebar as a side panel in the current tab
25
+ chrome.tabs.sendMessage(sender.tab.id, { type: "SHOW_SIDEBAR" });
26
+ return false;
27
+ }
28
+
29
+ });
30
+
31
+ // ── Capture + crop + OCR ─────────────────────────────────────────────────────
32
+
33
+ async function handleCapture(rect, tab) {
34
+ // 1. Capture the entire visible tab as a data URL
35
+ const dataUrl = await chrome.tabs.captureVisibleTab(tab.windowId, {
36
+ format: "png",
37
+ quality: 100,
38
+ });
39
+
40
+ // 2. Crop to the selected rect using OffscreenCanvas
41
+ const croppedBlob = await cropImage(dataUrl, rect);
42
+
43
+ // 3. Send to GLM-OCR backend
44
+ const formData = new FormData();
45
+ formData.append("file", croppedBlob, "selection.png");
46
+ formData.append("mode", OCR_MODE);
47
+
48
+ const res = await fetch(OCR_ENDPOINT, {
49
+ method: "POST",
50
+ body: formData,
51
+ });
52
+
53
+ if (!res.ok) {
54
+ const err = await res.json().catch(() => ({}));
55
+ throw new Error(err.detail || `Server returned ${res.status}`);
56
+ }
57
+
58
+ const data = await res.json();
59
+
60
+ // Also store the cropped image as a data URL for display in the sidebar
61
+ const croppedDataUrl = await blobToDataUrl(croppedBlob);
62
+
63
+ return {
64
+ text: data.text,
65
+ word_count: data.word_count,
66
+ char_count: data.char_count,
67
+ latency_ms: data.latency_ms,
68
+ mode: data.mode,
69
+ device: data.device,
70
+ imageDataUrl: croppedDataUrl,
71
+ timestamp: new Date().toISOString(),
72
+ };
73
+ }
74
+
75
+ // ── Image cropping using OffscreenCanvas ─────────────────────────────────────
76
+
77
+ async function cropImage(dataUrl, rect) {
78
+ // Decode the full screenshot
79
+ const res = await fetch(dataUrl);
80
+ const blob = await res.blob();
81
+ const bitmap = await createImageBitmap(blob);
82
+
83
+ // Scale rect by device pixel ratio (already baked into captureVisibleTab)
84
+ // captureVisibleTab captures at device pixel ratio already, so rect coords
85
+ // from getBoundingClientRect need to be scaled.
86
+ const dpr = rect.dpr || 1;
87
+ const sx = Math.round(rect.x * dpr);
88
+ const sy = Math.round(rect.y * dpr);
89
+ const sw = Math.round(rect.width * dpr);
90
+ const sh = Math.round(rect.height * dpr);
91
+
92
+ // Clamp to bitmap bounds
93
+ const cx = Math.max(0, sx);
94
+ const cy = Math.max(0, sy);
95
+ const cw = Math.min(sw, bitmap.width - cx);
96
+ const ch = Math.min(sh, bitmap.height - cy);
97
+
98
+ const canvas = new OffscreenCanvas(cw, ch);
99
+ const ctx = canvas.getContext("2d");
100
+ ctx.drawImage(bitmap, cx, cy, cw, ch, 0, 0, cw, ch);
101
+
102
+ return canvas.convertToBlob({ type: "image/png" });
103
+ }
104
+
105
+ function blobToDataUrl(blob) {
106
+ return new Promise((resolve, reject) => {
107
+ const reader = new FileReader();
108
+ reader.onload = () => resolve(reader.result);
109
+ reader.onerror = reject;
110
+ reader.readAsDataURL(blob);
111
+ });
112
+ }
113
+
114
+ // ── Server health check ───────────────────────────────────────────────────────
115
+
116
+ async function checkServer() {
117
+ try {
118
+ const r = await fetch("http://localhost:8000/health", { signal: AbortSignal.timeout(3000) });
119
+ const d = await r.json();
120
+ return d.status === "ok";
121
+ } catch {
122
+ return false;
123
+ }
124
+ }
Extension/content.css ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* content.css — Styles injected into every page for the selection overlay */
2
+
3
+ /* ── Overlay ── */
4
+ #glmocr-overlay {
5
+ position: fixed !important;
6
+ inset: 0 !important;
7
+ background: rgba(0, 0, 0, 0.45) !important;
8
+ z-index: 2147483646 !important;
9
+ cursor: crosshair !important;
10
+ user-select: none !important;
11
+ }
12
+
13
+ /* ── Hint text ── */
14
+ #glmocr-hint {
15
+ position: absolute !important;
16
+ top: 20px !important;
17
+ left: 50% !important;
18
+ transform: translateX(-50%) !important;
19
+ background: rgba(0, 0, 0, 0.8) !important;
20
+ color: #f5f0e8 !important;
21
+ font-family: 'IBM Plex Mono', monospace, monospace !important;
22
+ font-size: 13px !important;
23
+ padding: 10px 18px !important;
24
+ border-radius: 4px !important;
25
+ letter-spacing: 0.04em !important;
26
+ pointer-events: none !important;
27
+ white-space: nowrap !important;
28
+ border: 1px solid rgba(255,255,255,0.15) !important;
29
+ transition: opacity 0.2s !important;
30
+ }
31
+
32
+ /* ── Selection box: hidden until content.js sets an inline display while dragging ── */
33
+ #glmocr-selbox {
34
+ position: fixed !important;
35
+ display: none; /* not !important, so the script's inline display:block can override it */
36
+ border: 2px solid #c94a1f !important;
37
+ background: rgba(201, 74, 31, 0.08) !important;
38
+ box-shadow: 0 0 0 9999px rgba(0, 0, 0, 0.35) !important;
39
+ pointer-events: none !important;
40
+ z-index: 2147483647 !important;
41
+ }
42
+
43
+ /* ── Sidebar iframe ── */
44
+ #glmocr-sidebar {
45
+ position: fixed !important;
46
+ top: 0 !important;
47
+ right: 0 !important;
48
+ width: 380px !important;
49
+ height: 100vh !important;
50
+ border: none !important;
51
+ z-index: 2147483645 !important;
52
+ border-left: 2px solid #d4cfc3 !important;
53
+ box-shadow: -4px 0 24px rgba(0,0,0,0.12) !important;
54
+ animation: glmocr-slideIn 0.25s cubic-bezier(0.22, 1, 0.36, 1) !important;
55
+ }
56
+
57
+ @keyframes glmocr-slideIn {
58
+ from { transform: translateX(100%); opacity: 0; }
59
+ to { transform: translateX(0); opacity: 1; }
60
+ }
61
+
62
+ /* ── Toast ── */
63
+ #glmocr-toast {
64
+ position: fixed !important;
65
+ bottom: 24px !important;
66
+ left: 50% !important;
67
+ transform: translateX(-50%) !important;
68
+ background: #0f0e0d !important;
69
+ color: #f5f0e8 !important;
70
+ font-family: 'IBM Plex Mono', monospace, monospace !important;
71
+ font-size: 13px !important;
72
+ padding: 10px 20px !important;
73
+ border-radius: 4px !important;
74
+ z-index: 2147483647 !important;
75
+ white-space: nowrap !important;
76
+ animation: glmocr-fadeUp 0.3s ease both !important;
77
+ }
78
+
79
+ @keyframes glmocr-fadeUp {
80
+ from { opacity: 0; transform: translateX(-50%) translateY(12px); }
81
+ to { opacity: 1; transform: translateX(-50%) translateY(0); }
82
+ }
Extension/content.js ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // content.js — Injected into every page
2
+ // Manages the screen selection overlay and sidebar panel
3
+
4
+ let overlayActive = false;
5
+ let sidebarFrame = null;
6
+
7
+ // ── Listen for messages from background / popup ───────────────────────────────
8
+
9
+ chrome.runtime.onMessage.addListener((msg, _sender, sendResponse) => {
10
+
11
+ if (msg.type === "START_SELECTION") {
12
+ if (!overlayActive) startSelection();
13
+ sendResponse({ ok: true });
14
+ return false;
15
+ }
16
+
17
+ if (msg.type === "SHOW_SIDEBAR") {
18
+ showSidebar({});
19
+ return false;
20
+ }
21
+
22
+ if (msg.type === "SHOW_RESULT") {
23
+ showSidebar(msg.data);
24
+ return false;
25
+ }
26
+
27
+ });
28
+
29
+ // ── Selection overlay ─────────────────────────────────────────────────────────
30
+
31
+ function startSelection() {
32
+ overlayActive = true;
33
+
34
+ // Dim the page
35
+ const overlay = document.createElement("div");
36
+ overlay.id = "glmocr-overlay";
37
+
38
+ // Crosshair hint
39
+ const hint = document.createElement("div");
40
+ hint.id = "glmocr-hint";
41
+ hint.textContent = "Drag to select text region β€” Press Esc to cancel";
42
+ overlay.appendChild(hint);
43
+
44
+ // Selection box
45
+ const selBox = document.createElement("div");
46
+ selBox.id = "glmocr-selbox";
47
+ overlay.appendChild(selBox);
48
+
49
+ document.body.appendChild(overlay);
50
+
51
+ let startX = 0, startY = 0, isDragging = false;
52
+
53
+ function onMouseDown(e) {
54
+ if (e.button !== 0) return;
55
+ isDragging = true;
56
+ startX = e.clientX;
57
+ startY = e.clientY;
58
+ selBox.style.cssText = `left:${startX}px; top:${startY}px; width:0; height:0; display:block`;
59
+ hint.style.opacity = "0";
60
+ e.preventDefault();
61
+ }
62
+
63
+ function onMouseMove(e) {
64
+ if (!isDragging) return;
65
+ const x = Math.min(e.clientX, startX);
66
+ const y = Math.min(e.clientY, startY);
67
+ const w = Math.abs(e.clientX - startX);
68
+ const h = Math.abs(e.clientY - startY);
69
+ selBox.style.cssText = `left:${x}px; top:${y}px; width:${w}px; height:${h}px; display:block`;
70
+ }
71
+
72
+ function onMouseUp(e) {
73
+ if (!isDragging) return;
74
+ isDragging = false;
75
+
76
+ const x = Math.min(e.clientX, startX);
77
+ const y = Math.min(e.clientY, startY);
78
+ const w = Math.abs(e.clientX - startX);
79
+ const h = Math.abs(e.clientY - startY);
80
+
81
+ cleanup();
82
+
83
+ if (w < 10 || h < 10) {
84
+ showToast("Selection too small β€” try again.");
85
+ return;
86
+ }
87
+
88
+ const dpr = window.devicePixelRatio || 1;
89
+ const rect = {
90
+ x: x + window.scrollX,
91
+ y: y + window.scrollY,
92
+ width: w,
93
+ height: h,
94
+ dpr,
95
+ };
96
+
97
+ runOcr(rect);
98
+ }
99
+
100
+ function onKeyDown(e) {
101
+ if (e.key === "Escape") {
102
+ cleanup();
103
+ showToast("Cancelled.");
104
+ }
105
+ }
106
+
107
+ function cleanup() {
108
+ overlayActive = false;
109
+ overlay.removeEventListener("mousedown", onMouseDown);
110
+ overlay.removeEventListener("mousemove", onMouseMove);
111
+ overlay.removeEventListener("mouseup", onMouseUp);
112
+ document.removeEventListener("keydown", onKeyDown);
113
+ overlay.remove();
114
+ }
115
+
116
+ overlay.addEventListener("mousedown", onMouseDown);
117
+ overlay.addEventListener("mousemove", onMouseMove);
118
+ overlay.addEventListener("mouseup", onMouseUp);
119
+ document.addEventListener("keydown", onKeyDown);
120
+ }
121
+
122
+ // ── Send region to background for capture + OCR ───────────────────────────────
123
+
124
+ function runOcr(rect) {
125
+ // Show a loading sidebar immediately
126
+ showSidebar({ loading: true });
127
+
128
+ chrome.runtime.sendMessage({ type: "CAPTURE_REGION", rect }, (response) => {
129
+ if (chrome.runtime.lastError) {
130
+ showSidebar({ error: chrome.runtime.lastError.message });
131
+ return;
132
+ }
133
+ if (response.success) {
134
+ showSidebar(response);
135
+ } else {
136
+ showSidebar({ error: response.error });
137
+ }
138
+ });
139
+ }
140
+
141
+ // ── Sidebar panel ─────────────────────────────────────────────────────────────
142
+
143
+ function showSidebar(data) {
144
+ // Remove existing sidebar if any
145
+ if (sidebarFrame) sidebarFrame.remove();
146
+
147
+ const frame = document.createElement("iframe");
148
+ frame.id = "glmocr-sidebar";
149
+ frame.src = chrome.runtime.getURL("sidebar.html");
150
+
151
+ document.body.appendChild(frame);
152
+ sidebarFrame = frame;
153
+
154
+ // Wait for iframe to load, then send data
155
+ frame.onload = () => {
156
+ frame.contentWindow.postMessage({ type: "SIDEBAR_DATA", data }, "*");
157
+ };
158
+
159
+ // Close button via message from sidebar
160
+ window.addEventListener("message", (e) => {
161
+ if (e.data?.type === "CLOSE_SIDEBAR") {
162
+ frame.remove();
163
+ sidebarFrame = null;
164
+ }
165
+ if (e.data?.type === "START_NEW_SELECTION") {
166
+ frame.remove();
167
+ sidebarFrame = null;
168
+ startSelection();
169
+ }
170
+ });
171
+ }
172
+
173
+ // ── Toast notification ────────────────────────────────────────────────────────
174
+
175
+ function showToast(msg) {
176
+ const existing = document.getElementById("glmocr-toast");
177
+ if (existing) existing.remove();
178
+
179
+ const toast = document.createElement("div");
180
+ toast.id = "glmocr-toast";
181
+ toast.textContent = msg;
182
+ document.body.appendChild(toast);
183
+ setTimeout(() => toast?.remove(), 3000);
184
+ }
Extension/generate_icons.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ generate_icons.py — Run this once to create the extension icons.
3
+ Requires Pillow: pip install Pillow
4
+ """
5
+ from PIL import Image, ImageDraw, ImageFont
6
+ import os
7
+
8
+ os.makedirs("icons", exist_ok=True)
9
+
10
+ def make_icon(size):
11
+ img = Image.new("RGBA", (size, size), (0, 0, 0, 0))
12
+ draw = ImageDraw.Draw(img)
13
+
14
+ # Background rounded rect
15
+ pad = size // 8
16
+ draw.rounded_rectangle(
17
+ [pad, pad, size - pad, size - pad],
18
+ radius=size // 5,
19
+ fill="#c94a1f"
20
+ )
21
+
22
+ # Letter "G" for GLM
23
+ font_size = int(size * 0.52)
24
+ try:
25
+ font = ImageFont.truetype("arial.ttf", font_size)
26
+ except:
27
+ font = ImageFont.load_default()
28
+
29
+ text = "G"
30
+ bbox = draw.textbbox((0, 0), text, font=font)
31
+ tw = bbox[2] - bbox[0]
32
+ th = bbox[3] - bbox[1]
33
+ tx = (size - tw) // 2 - bbox[0]
34
+ ty = (size - th) // 2 - bbox[1]
35
+ draw.text((tx, ty), text, fill="white", font=font)
36
+
37
+ img.save(f"icons/icon{size}.png")
38
+ print(f"Created icons/icon{size}.png")
39
+
40
+ for s in [16, 48, 128]:
41
+ make_icon(s)
42
+
43
+ print("Done. Icons created in icons/")
Extension/icons/icon128.png ADDED
Extension/icons/icon16.png ADDED
Extension/icons/icon48.png ADDED
Extension/manifest.json ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "manifest_version": 3,
3
+ "name": "GLM-OCR — Text from Screen",
4
+ "version": "1.0.0",
5
+ "description": "Select any region on screen and extract text using the self-hosted GLM-OCR model.",
6
+
7
+ "permissions": [
8
+ "activeTab",
9
+ "scripting",
10
+ "tabs",
11
+ "storage"
12
+ ],
13
+
14
+ "host_permissions": [
15
+ "http://localhost:8000/*",
16
+ "<all_urls>"
17
+ ],
18
+
19
+ "background": {
20
+ "service_worker": "background.js"
21
+ },
22
+
23
+ "action": {
24
+ "default_popup": "popup.html",
25
+ "default_icon": {
26
+ "16": "icons/icon16.png",
27
+ "48": "icons/icon48.png",
28
+ "128": "icons/icon128.png"
29
+ }
30
+ },
31
+
32
+ "content_scripts": [
33
+ {
34
+ "matches": ["<all_urls>"],
35
+ "js": ["content.js"],
36
+ "css": ["content.css"],
37
+ "run_at": "document_idle",
38
+ "all_frames": false
39
+ }
40
+ ],
41
+
42
+ "icons": {
43
+ "16": "icons/icon16.png",
44
+ "48": "icons/icon48.png",
45
+ "128": "icons/icon128.png"
46
+ },
47
+
48
+ "web_accessible_resources": [
49
+ {
50
+ "resources": ["sidebar.html"],
51
+ "matches": ["<all_urls>"]
52
+ }
53
+ ]
54
+ }
Extension/popup.html ADDED
@@ -0,0 +1,302 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8"/>
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0"/>
6
+ <title>GLM-OCR</title>
7
+ <link href="https://fonts.googleapis.com/css2?family=IBM+Plex+Mono:wght@400;700&family=DM+Serif+Display:ital@0;1&family=DM+Sans:wght@400;500&display=swap" rel="stylesheet"/>
8
+ <style>
9
+ *, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }
10
+ :root {
11
+ --ink: #0f0e0d;
12
+ --paper: #f5f0e8;
13
+ --warm: #ede8dc;
14
+ --border: #d4cfc3;
15
+ --muted: #8f8880;
16
+ --accent: #c94a1f;
17
+ --green: #1a6b4a;
18
+ --mono: 'IBM Plex Mono', monospace;
19
+ --serif: 'DM Serif Display', serif;
20
+ --sans: 'DM Sans', sans-serif;
21
+ }
22
+
23
+ body {
24
+ width: 300px;
25
+ background: var(--paper);
26
+ color: var(--ink);
27
+ font-family: var(--sans);
28
+ }
29
+
30
+ body::before {
31
+ content: '';
32
+ position: fixed; inset: 0;
33
+ background-image: radial-gradient(circle, rgba(0,0,0,0.05) 1px, transparent 1px);
34
+ background-size: 16px 16px;
35
+ pointer-events: none;
36
+ }
37
+
38
+ .inner { position: relative; }
39
+
40
+ /* Header */
41
+ .header {
42
+ padding: 16px 18px 14px;
43
+ border-bottom: 2px solid var(--ink);
44
+ display: flex;
45
+ align-items: center;
46
+ justify-content: space-between;
47
+ }
48
+
49
+ .logo {
50
+ font-family: var(--serif);
51
+ font-size: 1.1rem;
52
+ letter-spacing: -0.01em;
53
+ }
54
+
55
+ .logo em { font-style: italic; color: var(--accent); }
56
+
57
+ .server-badge {
58
+ display: flex;
59
+ align-items: center;
60
+ gap: 5px;
61
+ font-family: var(--mono);
62
+ font-size: 0.58rem;
63
+ color: var(--muted);
64
+ letter-spacing: 0.06em;
65
+ }
66
+
67
+ .dot {
68
+ width: 6px; height: 6px;
69
+ border-radius: 50%;
70
+ background: var(--muted);
71
+ }
72
+ .dot.ok { background: var(--green); }
73
+ .dot.err { background: var(--accent); }
74
+ .dot.pulse { animation: blink 1.2s ease-in-out infinite; }
75
+ @keyframes blink { 0%,100%{opacity:1} 50%{opacity:0.3} }
76
+
77
+ /* Main CTA */
78
+ .cta-area {
79
+ padding: 20px 18px;
80
+ border-bottom: 1px solid var(--border);
81
+ }
82
+
83
+ .cta-label {
84
+ font-family: var(--mono);
85
+ font-size: 0.62rem;
86
+ color: var(--muted);
87
+ letter-spacing: 0.1em;
88
+ text-transform: uppercase;
89
+ margin-bottom: 10px;
90
+ }
91
+
92
+ .select-btn {
93
+ width: 100%;
94
+ padding: 14px;
95
+ background: var(--accent);
96
+ color: white;
97
+ border: none;
98
+ border-radius: 2px;
99
+ font-family: var(--serif);
100
+ font-size: 1rem;
101
+ cursor: pointer;
102
+ transition: background 0.15s;
103
+ display: flex;
104
+ align-items: center;
105
+ justify-content: center;
106
+ gap: 8px;
107
+ }
108
+
109
+ .select-btn:hover:not(:disabled) { background: #b53d15; }
110
+ .select-btn:disabled { opacity: 0.35; cursor: not-allowed; }
111
+
112
+ .select-btn .shortcut {
113
+ font-family: var(--mono);
114
+ font-size: 0.6rem;
115
+ opacity: 0.7;
116
+ margin-left: auto;
117
+ }
118
+
119
+ .offline-msg {
120
+ display: none;
121
+ margin-top: 10px;
122
+ font-family: var(--mono);
123
+ font-size: 0.65rem;
124
+ color: var(--accent);
125
+ line-height: 1.6;
126
+ }
127
+
128
+ .offline-msg.show { display: block; }
129
+
130
+ .offline-msg a {
131
+ color: var(--accent);
132
+ text-decoration: underline;
133
+ }
134
+
135
+ /* How it works */
136
+ .how {
137
+ padding: 16px 18px;
138
+ border-bottom: 1px solid var(--border);
139
+ }
140
+
141
+ .how-title {
142
+ font-family: var(--mono);
143
+ font-size: 0.6rem;
144
+ color: var(--muted);
145
+ letter-spacing: 0.1em;
146
+ text-transform: uppercase;
147
+ margin-bottom: 12px;
148
+ }
149
+
150
+ .step {
151
+ display: flex;
152
+ gap: 10px;
153
+ align-items: flex-start;
154
+ margin-bottom: 8px;
155
+ }
156
+
157
+ .step:last-child { margin-bottom: 0; }
158
+
159
+ .step-num {
160
+ font-family: var(--mono);
161
+ font-size: 0.6rem;
162
+ color: var(--accent);
163
+ font-weight: 700;
164
+ flex-shrink: 0;
165
+ margin-top: 2px;
166
+ }
167
+
168
+ .step-text {
169
+ font-size: 0.78rem;
170
+ line-height: 1.5;
171
+ color: var(--ink);
172
+ }
173
+
174
+ /* Settings */
175
+ .settings {
176
+ padding: 14px 18px;
177
+ }
178
+
179
+ .settings-title {
180
+ font-family: var(--mono);
181
+ font-size: 0.6rem;
182
+ color: var(--muted);
183
+ letter-spacing: 0.1em;
184
+ text-transform: uppercase;
185
+ margin-bottom: 10px;
186
+ }
187
+
188
+ .setting-row {
189
+ display: flex;
190
+ align-items: center;
191
+ justify-content: space-between;
192
+ margin-bottom: 8px;
193
+ }
194
+
195
+ .setting-label {
196
+ font-family: var(--mono);
197
+ font-size: 0.68rem;
198
+ color: var(--ink);
199
+ }
200
+
201
+ .mode-toggle {
202
+ display: flex;
203
+ gap: 4px;
204
+ }
205
+
206
+ .mode-opt {
207
+ font-family: var(--mono);
208
+ font-size: 0.58rem;
209
+ padding: 4px 8px;
210
+ border: 1px solid var(--border);
211
+ background: transparent;
212
+ color: var(--muted);
213
+ cursor: pointer;
214
+ border-radius: 2px;
215
+ transition: all 0.12s;
216
+ }
217
+
218
+ .mode-opt.active {
219
+ background: var(--ink);
220
+ border-color: var(--ink);
221
+ color: var(--paper);
222
+ }
223
+
224
+ /* Footer */
225
+ .footer {
226
+ padding: 10px 18px;
227
+ border-top: 1px solid var(--border);
228
+ font-family: var(--mono);
229
+ font-size: 0.58rem;
230
+ color: var(--muted);
231
+ display: flex;
232
+ justify-content: space-between;
233
+ }
234
+ </style>
235
+ </head>
236
+ <body>
237
+ <div class="inner">
238
+
239
+ <!-- Header -->
240
+ <div class="header">
241
+ <div class="logo">GLM-<em>OCR</em></div>
242
+ <div class="server-badge">
243
+ <div class="dot pulse" id="dot"></div>
244
+ <span id="server-label">checking…</span>
245
+ </div>
246
+ </div>
247
+
248
+ <!-- CTA -->
249
+ <div class="cta-area">
250
+ <div class="cta-label">Select region on screen</div>
251
+ <button class="select-btn" id="select-btn" disabled>
252
+ ✂ &nbsp;Select & Extract Text
253
+ </button>
254
+ <div class="offline-msg" id="offline-msg">
255
+ ⚠ GLM-OCR server not running.<br>
256
+ Start it with <code>python main.py</code> at <a href="http://localhost:8000" target="_blank">localhost:8000</a>.
257
+ </div>
258
+ </div>
259
+
260
+ <!-- How it works -->
261
+ <div class="how">
262
+ <div class="how-title">How it works</div>
263
+ <div class="step">
264
+ <div class="step-num">01</div>
265
+ <div class="step-text">Click the button above — page dims</div>
266
+ </div>
267
+ <div class="step">
268
+ <div class="step-num">02</div>
269
+ <div class="step-text">Drag a box around the text you want</div>
270
+ </div>
271
+ <div class="step">
272
+ <div class="step-num">03</div>
273
+ <div class="step-text">GLM-OCR extracts text into a sidebar</div>
274
+ </div>
275
+ <div class="step">
276
+ <div class="step-num">04</div>
277
+ <div class="step-text">Copy or download the result</div>
278
+ </div>
279
+ </div>
280
+
281
+ <!-- Settings -->
282
+ <div class="settings">
283
+ <div class="settings-title">Settings</div>
284
+ <div class="setting-row">
285
+ <span class="setting-label">OCR Mode</span>
286
+ <div class="mode-toggle">
287
+ <button class="mode-opt active" data-mode="recognize">recognize</button>
288
+ <button class="mode-opt" data-mode="parse">parse</button>
289
+ </div>
290
+ </div>
291
+ </div>
292
+
293
+ <div class="footer">
294
+ <span>zai-org/GLM-OCR · 0.9B</span>
295
+ <span>self-hosted</span>
296
+ </div>
297
+
298
+ </div>
299
+
300
+ <script src="popup.js"></script>
301
+ </body>
302
+ </html>
Extension/popup.js ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // popup.js
2
+
3
+ const selectBtn = document.getElementById("select-btn");
4
+ const dot = document.getElementById("dot");
5
+ const serverLabel = document.getElementById("server-label");
6
+ const offlineMsg = document.getElementById("offline-msg");
7
+
8
+ let selectedMode = "recognize";
9
+
10
+ // ── Check server health ───────────────────────────────────────────────────────
11
+ async function checkServer() {
12
+ try {
13
+ const r = await fetch("http://localhost:8000/health", {
14
+ signal: AbortSignal.timeout(3000),
15
+ });
16
+ const d = await r.json();
17
+ return d.status === "ok";
18
+ } catch {
19
+ return false;
20
+ }
21
+ }
22
+
23
+ async function updateServerStatus() {
24
+ const ok = await checkServer();
25
+ dot.className = `dot ${ok ? "ok" : "err"}`;
26
+ serverLabel.textContent = ok ? "server ready" : "offline";
27
+ selectBtn.disabled = !ok;
28
+ offlineMsg.classList.toggle("show", !ok);
29
+ }
30
+
31
+ updateServerStatus();
32
+
33
+ // ── Mode toggle ───────────────────────────────────────────────────────────────
34
+ document.querySelectorAll(".mode-opt").forEach(btn => {
35
+ btn.addEventListener("click", () => {
36
+ document.querySelectorAll(".mode-opt").forEach(b => b.classList.remove("active"));
37
+ btn.classList.add("active");
38
+ selectedMode = btn.dataset.mode;
39
+ chrome.storage.local.set({ ocrMode: selectedMode });
40
+ });
41
+ });
42
+
43
+ // Restore saved mode
44
+ chrome.storage.local.get(["ocrMode"], ({ ocrMode }) => {
45
+ if (ocrMode) {
46
+ selectedMode = ocrMode;
47
+ document.querySelectorAll(".mode-opt").forEach(btn => {
48
+ btn.classList.toggle("active", btn.dataset.mode === ocrMode);
49
+ });
50
+ }
51
+ });
52
+
53
+ // ── Select button ─────────────────────────────────────────────────────────────
54
+ selectBtn.addEventListener("click", async () => {
55
+ // Save current mode to storage so background can read it
56
+ await chrome.storage.local.set({ ocrMode: selectedMode });
57
+
58
+ // Get current tab and inject the selection
59
+ const [tab] = await chrome.tabs.query({ active: true, currentWindow: true });
60
+
61
+ await chrome.scripting.executeScript({
62
+ target: { tabId: tab.id },
63
+ func: () => {
64
+ window.postMessage({ type: "GLMOCR_START" }, "*");
65
+ },
66
+ });
67
+
68
+ // Tell content script to start selection mode
69
+ chrome.tabs.sendMessage(tab.id, { type: "START_SELECTION" });
70
+
71
+ // Close popup so it doesn't obscure the page
72
+ window.close();
73
+ });
Extension/sidebar.html ADDED
@@ -0,0 +1,391 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8"/>
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0"/>
6
+ <title>GLM-OCR Result</title>
7
+ <link href="https://fonts.googleapis.com/css2?family=IBM+Plex+Mono:wght@400;500;700&family=DM+Serif+Display:ital@0;1&family=DM+Sans:wght@400;500&display=swap" rel="stylesheet"/>
8
+ <style>
9
+ *, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }
10
+ :root {
11
+ --ink: #0f0e0d;
12
+ --paper: #f5f0e8;
13
+ --warm: #ede8dc;
14
+ --border: #d4cfc3;
15
+ --muted: #8f8880;
16
+ --accent: #c94a1f;
17
+ --green: #1a6b4a;
18
+ --mono: 'IBM Plex Mono', monospace;
19
+ --serif: 'DM Serif Display', serif;
20
+ --sans: 'DM Sans', sans-serif;
21
+ }
22
+
23
+ html, body {
24
+ height: 100%;
25
+ background: var(--paper);
26
+ color: var(--ink);
27
+ font-family: var(--sans);
28
+ overflow: hidden;
29
+ }
30
+
31
+ body::before {
32
+ content: '';
33
+ position: fixed; inset: 0;
34
+ background-image: radial-gradient(circle, rgba(0,0,0,0.05) 1px, transparent 1px);
35
+ background-size: 16px 16px;
36
+ pointer-events: none;
37
+ }
38
+
39
+ .sidebar {
40
+ position: relative;
41
+ height: 100vh;
42
+ display: flex;
43
+ flex-direction: column;
44
+ }
45
+
46
+ /* ── Header ── */
47
+ .sb-header {
48
+ padding: 14px 16px;
49
+ border-bottom: 2px solid var(--ink);
50
+ display: flex;
51
+ align-items: center;
52
+ justify-content: space-between;
53
+ flex-shrink: 0;
54
+ }
55
+
56
+ .sb-title {
57
+ font-family: var(--serif);
58
+ font-size: 1rem;
59
+ letter-spacing: -0.01em;
60
+ }
61
+
62
+ .sb-title em { font-style: italic; color: var(--accent); }
63
+
64
+ .sb-close {
65
+ font-family: var(--mono);
66
+ font-size: 0.6rem;
67
+ padding: 5px 10px;
68
+ border: 1px solid var(--border);
69
+ background: transparent;
70
+ cursor: pointer;
71
+ border-radius: 2px;
72
+ color: var(--muted);
73
+ transition: all 0.12s;
74
+ }
75
+
76
+ .sb-close:hover { border-color: var(--ink); color: var(--ink); }
77
+
78
+ /* ── Scrollable body ── */
79
+ .sb-body {
80
+ flex: 1;
81
+ overflow-y: auto;
82
+ display: flex;
83
+ flex-direction: column;
84
+ }
85
+
86
+ /* ── Loading ── */
87
+ .sb-loading {
88
+ flex: 1;
89
+ display: flex;
90
+ flex-direction: column;
91
+ align-items: center;
92
+ justify-content: center;
93
+ gap: 16px;
94
+ padding: 24px;
95
+ }
96
+
97
+ .scan-bar-wrap { width: 140px; height: 3px; background: var(--border); border-radius: 2px; overflow: hidden; }
98
+ .scan-bar { height: 100%; background: var(--accent); border-radius: 2px; animation: scan 1.4s ease-in-out infinite; }
99
+ @keyframes scan { 0%{transform:translateX(-100%)} 50%{transform:translateX(0)} 100%{transform:translateX(100%)} }
100
+
101
+ .loading-label {
102
+ font-family: var(--mono);
103
+ font-size: 0.68rem;
104
+ color: var(--muted);
105
+ animation: blink 1.4s ease-in-out infinite;
106
+ }
107
+
108
+ @keyframes blink { 0%,100%{opacity:1} 50%{opacity:0.3} }
109
+
110
+ /* ── Error ── */
111
+ .sb-error {
112
+ margin: 16px;
113
+ background: #fff0f0;
114
+ border: 1px solid rgba(201,74,31,0.3);
115
+ border-radius: 2px;
116
+ padding: 14px;
117
+ font-family: var(--mono);
118
+ font-size: 0.72rem;
119
+ color: var(--accent);
120
+ line-height: 1.7;
121
+ }
122
+
123
+ /* ── Image preview ── */
124
+ .sb-image-wrap {
125
+ padding: 14px 16px 0;
126
+ flex-shrink: 0;
127
+ }
128
+
129
+ .sb-image-label {
130
+ font-family: var(--mono);
131
+ font-size: 0.58rem;
132
+ color: var(--muted);
133
+ letter-spacing: 0.1em;
134
+ text-transform: uppercase;
135
+ margin-bottom: 8px;
136
+ }
137
+
138
+ .sb-image {
139
+ width: 100%;
140
+ max-height: 160px;
141
+ object-fit: contain;
142
+ border: 1px solid var(--border);
143
+ border-radius: 2px;
144
+ background: var(--warm);
145
+ }
146
+
147
+ /* ── Meta chips ── */
148
+ .sb-meta {
149
+ padding: 10px 16px;
150
+ display: flex;
151
+ gap: 10px;
152
+ flex-wrap: wrap;
153
+ border-bottom: 1px solid var(--border);
154
+ flex-shrink: 0;
155
+ }
156
+
157
+ .chip {
158
+ font-family: var(--mono);
159
+ font-size: 0.6rem;
160
+ color: var(--muted);
161
+ }
162
+
163
+ .chip strong { color: var(--green); }
164
+
165
+ /* ── Extracted text ── */
166
+ .sb-text-section {
167
+ padding: 14px 16px;
168
+ display: flex;
169
+ flex-direction: column;
170
+ gap: 8px;
171
+ flex: 1;
172
+ min-height: 0;
173
+ }
174
+
175
+ .sb-text-label {
176
+ font-family: var(--mono);
177
+ font-size: 0.58rem;
178
+ color: var(--muted);
179
+ letter-spacing: 0.1em;
180
+ text-transform: uppercase;
181
+ flex-shrink: 0;
182
+ }
183
+
184
+ .sb-text {
185
+ background: var(--warm);
186
+ border: 1px solid var(--border);
187
+ border-radius: 2px;
188
+ padding: 14px;
189
+ font-family: var(--mono);
190
+ font-size: 0.78rem;
191
+ line-height: 1.85;
192
+ white-space: pre-wrap;
193
+ word-break: break-word;
194
+ overflow-y: auto;
195
+ flex: 1;
196
+ min-height: 120px;
197
+ }
198
+
199
+ /* ── Actions ── */
200
+ .sb-actions {
201
+ padding: 12px 16px;
202
+ border-top: 1px solid var(--border);
203
+ display: flex;
204
+ gap: 8px;
205
+ flex-shrink: 0;
206
+ }
207
+
208
+ .action-btn {
209
+ font-family: var(--mono);
210
+ font-size: 0.62rem;
211
+ letter-spacing: 0.04em;
212
+ padding: 9px 12px;
213
+ border: 1px solid var(--border);
214
+ background: transparent;
215
+ color: var(--ink);
216
+ cursor: pointer;
217
+ border-radius: 2px;
218
+ transition: all 0.12s;
219
+ flex: 1;
220
+ }
221
+
222
+ .action-btn:hover { border-color: var(--ink); }
223
+
224
+ .action-btn.primary {
225
+ background: var(--accent);
226
+ border-color: var(--accent);
227
+ color: white;
228
+ }
229
+
230
+ .action-btn.primary:hover { background: #b53d15; }
231
+
232
+ /* ── Toast ── */
233
+ .toast {
234
+ position: fixed;
235
+ bottom: 16px;
236
+ left: 50%;
237
+ transform: translateX(-50%) translateY(40px);
238
+ opacity: 0;
239
+ background: var(--ink);
240
+ color: var(--paper);
241
+ font-family: var(--mono);
242
+ font-size: 0.65rem;
243
+ padding: 8px 16px;
244
+ border-radius: 2px;
245
+ white-space: nowrap;
246
+ transition: all 0.3s cubic-bezier(0.34, 1.56, 0.64, 1);
247
+ z-index: 999;
248
+ }
249
+
250
+ .toast.show {
251
+ transform: translateX(-50%) translateY(0);
252
+ opacity: 1;
253
+ }
254
+ </style>
255
+ </head>
256
+ <body>
257
+ <div class="sidebar">
258
+
259
+ <!-- Header -->
260
+ <div class="sb-header">
261
+ <div class="sb-title">GLM-<em>OCR</em> Result</div>
262
+ <button class="sb-close" id="close-btn">✕ Close</button>
263
+ </div>
264
+
265
+ <!-- Body -->
266
+ <div class="sb-body" id="sb-body">
267
+
268
+ <!-- Loading state (default) -->
269
+ <div class="sb-loading" id="state-loading">
270
+ <div class="scan-bar-wrap"><div class="scan-bar"></div></div>
271
+ <div class="loading-label">Running GLM-OCR…</div>
272
+ </div>
273
+
274
+ </div>
275
+
276
+ <!-- Actions (shown after result) -->
277
+ <div class="sb-actions" id="sb-actions" style="display:none">
278
+ <button class="action-btn primary" id="new-btn">✂ New Selection</button>
279
+ <button class="action-btn" id="copy-btn">Copy</button>
280
+ <button class="action-btn" id="dl-btn">↓ .txt</button>
281
+ </div>
282
+
283
+ </div>
284
+
285
+ <div class="toast" id="toast"></div>
286
+
287
+ <script>
288
+ let extractedText = "";
289
+
290
+ // ── Receive data from content.js ──────────────────────────────────────────
291
// Handles SIDEBAR_DATA messages posted into this frame. Any window holding a
// reference to this frame can post here, so the payload is treated as
// untrusted: messages without an object payload are ignored rather than
// dereferenced.
window.addEventListener("message", (e) => {
  if (e.data?.type !== "SIDEBAR_DATA") return;

  const data = e.data.data;
  if (data === null || typeof data !== "object") return;

  if (data.loading) return; // already showing loading state

  renderResult(data);
});
299
+
300
/**
 * Replaces the sidebar body with either an error box or the OCR result
 * (image preview, meta chips, extracted text) and reveals the action bar.
 *
 * `data` arrives via postMessage and is treated as untrusted: every value
 * interpolated into the markup is passed through escapeHtml(), and the preview
 * image is only used when it is a `data:image/` URL.
 *
 * @param {object} data - Payload with either `error`, or `text`, `word_count`,
 *   `char_count`, `latency_ms`, `device` and `imageDataUrl`.
 */
function renderResult(data) {
  const body = document.getElementById("sb-body");
  const actions = document.getElementById("sb-actions");

  if (data.error) {
    body.innerHTML = `<div class="sb-error">⚠ ${escapeHtml(data.error)}<br><br>Make sure the GLM-OCR server is running at localhost:8000.</div>`;
    actions.style.display = "flex";
    return;
  }

  extractedText = data.text || "";

  const latency = data.latency_ms ? `${(data.latency_ms / 1000).toFixed(2)}s` : "—";

  // Only accept inline image data for the preview; anything else is dropped.
  const imageSrc =
    typeof data.imageDataUrl === "string" && data.imageDataUrl.startsWith("data:image/")
      ? data.imageDataUrl
      : "";

  const textHtml = data.text
    ? escapeHtml(data.text)
    : '<span style="color:var(--muted);">[No text detected]</span>';

  body.innerHTML = `
    <div class="sb-image-wrap">
      <div class="sb-image-label">Selected Region</div>
      <img class="sb-image" src="${escapeHtml(imageSrc)}" alt="Selection"/>
    </div>

    <div class="sb-meta">
      <span class="chip">words: <strong>${escapeHtml(data.word_count || 0)}</strong></span>
      <span class="chip">chars: <strong>${escapeHtml(data.char_count || 0)}</strong></span>
      <span class="chip">latency: <strong>${escapeHtml(latency)}</strong></span>
      <span class="chip">device: <strong>${escapeHtml(data.device || "—")}</strong></span>
    </div>

    <div class="sb-text-section">
      <div class="sb-text-label">Extracted Text</div>
      <div class="sb-text" id="result-text">${textHtml}</div>
    </div>
  `;

  actions.style.display = "flex";
}

/**
 * Escapes a value for safe interpolation into HTML text or a quoted attribute.
 *
 * @param {*} str - Value to escape; non-strings are converted with String().
 * @returns {string} The value with `&`, `<`, `>`, `"` and `'` replaced by entities.
 */
function escapeHtml(str) {
  return String(str)
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&#39;");
}
342
+
343
+ // ── Close ─────────────────────────────────────────────────────────────────
344
// Close and New-selection buttons: each click just notifies the parent window,
// which is expected to act on the message type.
const notifyParentOnClick = (buttonId, type) => {
  document.getElementById(buttonId).addEventListener("click", () => {
    window.parent.postMessage({ type }, "*");
  });
};

notifyParentOnClick("close-btn", "CLOSE_SIDEBAR");
notifyParentOnClick("new-btn", "START_NEW_SELECTION");
352
+
353
+ // ── Copy ───────────────────────────────────────────────────────────��──────
354
// Copy button: try the async Clipboard API first; if it rejects, select the
// result text so the user can copy it manually.
document.getElementById("copy-btn").addEventListener("click", async () => {
  const selectResultText = () => {
    const resultEl = document.getElementById("result-text");
    if (!resultEl) return;

    const range = document.createRange();
    range.selectNodeContents(resultEl);

    const selection = window.getSelection();
    selection.removeAllRanges();
    selection.addRange(range);
  };

  try {
    await navigator.clipboard.writeText(extractedText);
    toast("Copied!");
  } catch {
    selectResultText();
    toast("Select text above and copy manually.");
  }
});
371
+
372
+ // ── Download ──────────────────────────────────────────────────────────────
373
// Download button: wrap the extracted text in a Blob, trigger a download of it
// through a temporary <a> element, then release the object URL.
document.getElementById("dl-btn").addEventListener("click", () => {
  const objectUrl = URL.createObjectURL(
    new Blob([extractedText], { type: "text/plain" })
  );

  const link = document.createElement("a");
  link.href = objectUrl;
  link.download = `glm-ocr-${Date.now()}.txt`;
  link.click();

  URL.revokeObjectURL(link.href);
});
381
+
382
+ // ── Toast ─────────────────────────────────────────────────────────────────
383
// Timer that will hide the currently visible toast (null when none is pending).
let toastTimer = null;

/**
 * Shows a short message in the #toast element for two seconds.
 * A pending hide timer from an earlier call is cancelled first, so a new
 * message always stays visible for its full duration.
 *
 * @param {string} msg - Text to display.
 */
function toast(msg) {
  const t = document.getElementById("toast");
  t.textContent = msg;
  t.classList.add("show");

  clearTimeout(toastTimer);
  toastTimer = setTimeout(() => {
    t.classList.remove("show");
    toastTimer = null;
  }, 2000);
}
389
+ </script>
390
+ </body>
391
+ </html>
README.md ADDED
@@ -0,0 +1,277 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: GLM OCR
3
+ emoji: 📄
4
+ colorFrom: indigo
5
+ colorTo: purple
6
+ sdk: docker
7
+ app_port: 7860
8
+ pinned: false
9
+ ---
10
+
11
+ # GOT-OCR 2.0 — Self-Hosted OCR Engine
12
+
13
+ > A full-stack portfolio project: self-hosted OCR backend powered by **General OCR Theory (GOT-OCR 2.0)**, a 580M-param vision-language model trained end-to-end for document understanding.
14
+
15
+ ---
16
+
17
+ ## What is GOT-OCR 2.0?
18
+
19
+ GOT-OCR 2.0 is a state-of-the-art open-source OCR model from the paper
20
+ **"General OCR Theory: Towards OCR-2.0 via a Unified End-to-end Model"** (arXiv:2409.01704).
21
+
22
+ Unlike traditional OCR (Tesseract, etc.) it uses a **vision encoder + language model** architecture:
23
+
24
+ ```
25
+ Image → [CLIP-style Vision Encoder] → [Qwen2 LM Backbone] → Text
26
+ ```
27
+
28
+ It handles:
29
+ - Plain text from documents, screenshots, photos
30
+ - **Tables** (preserved structure)
31
+ - **Mathematical equations** (LaTeX output)
32
+ - **Code blocks** (syntax preserved)
33
+ - **Multilingual** text
34
+ - **Handwriting**
35
+
36
+ ---
37
+
38
+ ## Project Structure
39
+
40
+ ```
41
+ got-ocr-project/
42
+ ├── backend/
43
+ │   ├── main.py            # FastAPI server — routes, CORS, request handling
44
+ │   ├── ocr_engine.py      # Model loading, inference, OcrResult dataclass
45
+ │   └── requirements.txt
46
+ ├── frontend/
47
+ │   └── index.html         # Single-file frontend (served by FastAPI)
48
+ ├── Dockerfile             # CUDA + CPU build
49
+ ├── docker-compose.yml     # One-command deployment
50
+ └── README.md
51
+ ```
52
+
53
+ ---
54
+
55
+ ## Quickstart
56
+
57
+ ### Option 1 — Docker (recommended)
58
+
59
+ ```bash
60
+ git clone https://github.com/YOUR_USERNAME/got-ocr-project
61
+ cd got-ocr-project
62
+
63
+ # CPU-only (comment out the `deploy` block in docker-compose.yml first)
64
+ docker compose up --build
65
+
66
+ # With GPU
67
+ docker compose up --build
68
+
69
+ # App is live at http://localhost:8000
70
+ ```
71
+
72
+ The first startup downloads ~2 GB of model weights — they are cached in a Docker volume afterward.
73
+
74
+ ---
75
+
76
+ ### Option 2 — Local Python (no Docker)
77
+
78
+ ```bash
79
+ # 1. Clone
80
+ git clone https://github.com/YOUR_USERNAME/got-ocr-project
81
+ cd got-ocr-project
82
+
83
+ # 2. Create virtualenv
84
+ python -m venv .venv
85
+ source .venv/bin/activate # Windows: .venv\Scripts\activate
86
+
87
+ # 3. Install dependencies
88
+ pip install -r backend/requirements.txt
89
+
90
+ # 4. Run
91
+ cd backend
92
+ python main.py
93
+
94
+ # App is live at http://localhost:8000
95
+ ```
96
+
97
+ Model weights (~2 GB) download automatically from HuggingFace on first startup.
98
+
99
+ ---
100
+
101
+ ## API Reference
102
+
103
+ ### `POST /ocr`
104
+
105
+ Run OCR on an uploaded image.
106
+
107
+ **Request** — `multipart/form-data`
108
+
109
+ | Field | Type | Required | Description |
110
+ |--------|--------|----------|-------------|
111
+ | `file` | file | ✅ | Image file (PNG, JPG, WEBP, GIF, BMP, TIFF, max 20 MB) |
112
+ | `mode` | string | ✗ | `ocr` (default) for plain text · `format` for structured (Markdown/LaTeX) |
113
+
114
+ **Response** — `application/json`
115
+
116
+ ```json
117
+ {
118
+ "success": true,
119
+ "text": "Extracted text here...",
120
+ "word_count": 142,
121
+ "char_count": 863,
122
+ "latency_ms": 1240.5,
123
+ "mode": "ocr",
124
+ "model_id": "stepfun-ai/GOT-OCR2_0",
125
+ "device": "cuda"
126
+ }
127
+ ```
128
+
129
+ **Example — curl**
130
+ ```bash
131
+ curl -X POST http://localhost:8000/ocr \
132
+ -F "file=@document.png" \
133
+ -F "mode=ocr"
134
+ ```
135
+
136
+ **Example — Python**
137
+ ```python
138
+ import requests
139
+
140
+ with open("document.png", "rb") as f:
141
+ r = requests.post(
142
+ "http://localhost:8000/ocr",
143
+ files={"file": ("document.png", f, "image/png")},
144
+ data={"mode": "ocr"},
145
+ )
146
+
147
+ data = r.json()
148
+ print(data["text"])
149
+ ```
150
+
151
+ **Example — JavaScript fetch**
152
+ ```js
153
+ const formData = new FormData();
154
+ formData.append("file", imageFile);
155
+ formData.append("mode", "ocr");
156
+
157
+ const res = await fetch("http://localhost:8000/ocr", { method: "POST", body: formData });
158
+ const data = await res.json();
159
+ console.log(data.text);
160
+ ```
161
+
162
+ ---
163
+
164
+ ### `GET /health`
165
+
166
+ ```json
167
+ {
168
+ "status": "ok",
169
+ "model": {
170
+ "model_id": "stepfun-ai/GOT-OCR2_0",
171
+ "device": "cuda",
172
+ "loaded": true,
173
+ "gpu_name": "NVIDIA RTX 3090",
174
+ "gpu_memory_gb": 24.0
175
+ }
176
+ }
177
+ ```
178
+
179
+ ### `GET /metrics`
180
+
181
+ Session-level stats (resets on server restart).
182
+
183
+ ```json
184
+ {
185
+ "total_requests": 14,
186
+ "total_words_extracted": 3821,
187
+ "avg_latency_ms": 980.4,
188
+ "error_count": 0,
189
+ "uptime_seconds": 3620.1
190
+ }
191
+ ```
192
+
193
+ ---
194
+
195
+ ## Hardware Requirements
196
+
197
+ | Setup | VRAM / RAM | Latency |
198
+ |-------|------------|---------|
199
+ | NVIDIA GPU (8 GB+ VRAM) | ~4 GB VRAM | ~0.5 – 1.5 s/image |
200
+ | CPU (16 GB RAM) | ~3 GB RAM | ~8 – 20 s/image |
201
+ | Apple Silicon (MPS) | ~4 GB | ~3 – 6 s/image |
202
+
203
+ ---
204
+
205
+ ## Deployment to Cloud
206
+
207
+ ### Fly.io (GPU, pay-as-you-go)
208
+
209
+ ```bash
210
+ fly launch --name got-ocr
211
+ fly scale vm a100-40gb
212
+ fly deploy
213
+ ```
214
+
215
+ ### Render / Railway (CPU only)
216
+
217
+ Push to GitHub → connect repo → set start command:
218
+ ```
219
+ cd backend && uvicorn main:app --host 0.0.0.0 --port $PORT
220
+ ```
221
+
222
+ ### Vast.ai / RunPod (cheapest GPU)
223
+
224
+ Rent an RTX 3090 node, SSH in, clone repo, run `docker compose up`.
225
+
226
+ ---
227
+
228
+ ## Architecture Diagram
229
+
230
+ ```
231
+             Browser
232
+                │
233
+                │  POST /ocr  (multipart image + mode)
234
+                ▼
235
+ ┌─────────────────────────────────────┐
236
+ │  FastAPI (main.py)                  │
237
+ │  ─ CORS middleware                  │
238
+ │  ─ file validation (type, size)     │
239
+ │  ─ session metrics                  │
240
+ └──────────────┬──────────────────────┘
241
+                │  image_bytes, mode
242
+                ▼
243
+ ┌─────────────────────────────────────┐
244
+ │  GotOcrEngine (ocr_engine.py)       │
245
+ │  ─ PIL validation & RGB conversion  │
246
+ │  ─ writes temp PNG to disk          │
247
+ │  ─ model.chat(tokenizer, path, …)   │
248
+ │  ─ returns OcrResult dataclass      │
249
+ └──────────────┬──────────────────────┘
250
+                │  (torch.inference_mode)
251
+                ▼
252
+ ┌─────────────────────────────────────┐
253
+ │  GOT-OCR 2.0 Model                  │
254
+ │  stepfun-ai/GOT-OCR2_0              │
255
+ │  ─ Vision encoder (SigLIP)          │
256
+ │  ─ LM backbone (Qwen2-0.5B)         │
257
+ │  ─ Runs on CUDA / CPU / MPS         │
258
+ └─────────────────────────────────────┘
259
+ ```
260
+
261
+ ---
262
+
263
+ ## What makes this a good portfolio project?
264
+
265
+ - **Self-hosted ML inference** — no API dependency, model runs on your server
266
+ - **End-to-end system** — frontend + REST API + ML pipeline + Docker
267
+ - **Production patterns** — lifespan events, CORS, input validation, error handling, metrics endpoint, health check
268
+ - **Real ML engineering** — model loading, device management, temp file handling, `torch.inference_mode()`
269
+ - **Clean code** — dataclasses, logging, type hints, docstrings
270
+
271
+ ---
272
+
273
+ ## References
274
+
275
+ - Paper: [GOT-OCR 2.0](https://arxiv.org/abs/2409.01704)
276
+ - Model: [stepfun-ai/GOT-OCR2_0](https://huggingface.co/stepfun-ai/GOT-OCR2_0)
277
+ - Code: [GOT-OCR2.0 GitHub](https://github.com/Ucas-HaoranWei/GOT-OCR2.0)
background.js ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // background.js β€” Service worker
2
+ // Handles: tab screenshot, image crop, OCR API call, result relay
3
+
4
+ const OCR_ENDPOINT = "http://localhost:8000/ocr";
5
+ const OCR_MODE = "recognize"; // or "parse"
6
+
7
+ // ── Listen for messages from content.js ─────────────────────────────────────
8
+
9
// Routes messages sent to the service worker:
//   CAPTURE_REGION → screenshot + crop + OCR; replies { success, ...result } or { success: false, error }
//   PING           → replies { ok } with the server health status
//   OPEN_SIDEBAR   → asks the sender's tab to show the sidebar (no reply)
// Returning true keeps the reply channel open for the asynchronous branches.
chrome.runtime.onMessage.addListener((msg, sender, sendResponse) => {
  switch (msg?.type) {
    case "CAPTURE_REGION":
      handleCapture(msg.rect, sender.tab)
        .then((result) => sendResponse({ success: true, ...result }))
        .catch((error) =>
          // Rejections are not guaranteed to be Error objects; always send a message.
          sendResponse({ success: false, error: error?.message ?? String(error) })
        );
      return true; // keep channel open for async

    case "PING":
      checkServer().then((ok) => sendResponse({ ok }));
      return true;

    case "OPEN_SIDEBAR": {
      // The sender only has a tab when the message comes from a content script.
      const tabId = sender.tab?.id;
      if (tabId == null) {
        console.warn("GLM-OCR: OPEN_SIDEBAR received from a sender without a tab.");
        return false;
      }
      chrome.tabs
        .sendMessage(tabId, { type: "SHOW_SIDEBAR" })
        .catch((err) => console.warn("GLM-OCR: could not reach the content script.", err));
      return false;
    }
  }
});
30
+
31
+ // ── Capture + crop + OCR ─────────────────────────────────────────────────────
32
+
33
/**
 * Captures the visible part of `tab`, crops it to `rect`, sends the crop to the
 * OCR backend and returns the backend's result together with the cropped image.
 *
 * The OCR mode is the `ocrMode` value the popup saves in chrome.storage.local;
 * OCR_MODE is used when nothing is stored or storage cannot be read.
 * NOTE(review): assumes the popup's stored mode values are ones the backend
 * accepts — confirm against the popup markup.
 *
 * @param {{x: number, y: number, width: number, height: number, dpr?: number}} rect
 *   Selection rectangle, passed through to cropImage().
 * @param {chrome.tabs.Tab} tab - Tab the request came from.
 * @returns {Promise<object>} text, word_count, char_count, latency_ms, mode,
 *   device (all from the backend response), imageDataUrl and an ISO timestamp.
 * @throws {Error} When there is no tab, or the backend answers with a non-2xx status.
 */
async function handleCapture(rect, tab) {
  if (!tab) {
    throw new Error("Capture was requested without a source tab.");
  }

  // 1. Capture the entire visible tab as a data URL.
  //    (`quality` is omitted: it only applies to the JPEG format.)
  const dataUrl = await chrome.tabs.captureVisibleTab(tab.windowId, {
    format: "png",
  });

  // 2. Crop to the selected rect using OffscreenCanvas
  const croppedBlob = await cropImage(dataUrl, rect);

  // 3. Resolve the OCR mode chosen in the popup, falling back to the default.
  let mode = OCR_MODE;
  try {
    const { ocrMode } = await chrome.storage.local.get(["ocrMode"]);
    if (ocrMode) mode = ocrMode;
  } catch (err) {
    console.warn("GLM-OCR: could not read the saved OCR mode; using the default.", err);
  }

  // 4. Send to GLM-OCR backend
  const formData = new FormData();
  formData.append("file", croppedBlob, "selection.png");
  formData.append("mode", mode);

  const res = await fetch(OCR_ENDPOINT, {
    method: "POST",
    body: formData,
  });

  if (!res.ok) {
    // The error body may not be JSON; fall back to the status code.
    const err = await res.json().catch(() => ({}));
    throw new Error(err.detail || `Server returned ${res.status}`);
  }

  const data = await res.json();

  // Also store the cropped image as a data URL for display in the sidebar
  const croppedDataUrl = await blobToDataUrl(croppedBlob);

  return {
    text: data.text,
    word_count: data.word_count,
    char_count: data.char_count,
    latency_ms: data.latency_ms,
    mode: data.mode,
    device: data.device,
    imageDataUrl: croppedDataUrl,
    timestamp: new Date().toISOString(),
  };
}
74
+
75
+ // ── Image cropping using OffscreenCanvas ─────────────────────────────────────
76
+
77
/**
 * Converts a selection rectangle into a crop box in bitmap pixels, clipped to
 * the bitmap. All four edges are clamped, so a selection that starts outside
 * the bitmap shrinks instead of shifting.
 *
 * @param {{x: number, y: number, width: number, height: number, dpr?: number}} rect
 *   Selection rectangle; coordinates are multiplied by `rect.dpr` (default 1).
 * @param {number} maxWidth - Bitmap width in pixels.
 * @param {number} maxHeight - Bitmap height in pixels.
 * @returns {{x: number, y: number, width: number, height: number}} Crop box;
 *   width/height are <= 0 when the selection does not overlap the bitmap.
 */
function clampCropRect(rect, maxWidth, maxHeight) {
  const dpr = rect.dpr || 1;
  const sx = Math.round(rect.x * dpr);
  const sy = Math.round(rect.y * dpr);
  const sw = Math.round(rect.width * dpr);
  const sh = Math.round(rect.height * dpr);

  const left = Math.max(0, sx);
  const top = Math.max(0, sy);
  const right = Math.min(maxWidth, sx + sw);
  const bottom = Math.min(maxHeight, sy + sh);

  return { x: left, y: top, width: right - left, height: bottom - top };
}

/**
 * Crops a screenshot (given as a data URL) to `rect` and returns it as a PNG blob.
 *
 * `rect` is scaled by `rect.dpr` before cropping.
 * NOTE(review): this assumes the screenshot is rendered at that device pixel
 * ratio — confirm against captureVisibleTab's behaviour.
 *
 * @param {string} dataUrl - Screenshot as a data URL.
 * @param {{x: number, y: number, width: number, height: number, dpr?: number}} rect
 * @returns {Promise<Blob>} PNG blob of the cropped region.
 * @throws {Error} When the selection does not overlap the screenshot.
 */
async function cropImage(dataUrl, rect) {
  // Decode the full screenshot
  const res = await fetch(dataUrl);
  const blob = await res.blob();
  const bitmap = await createImageBitmap(blob);

  try {
    const { x, y, width, height } = clampCropRect(rect, bitmap.width, bitmap.height);
    if (width <= 0 || height <= 0) {
      throw new Error("Selected region is outside the captured screenshot.");
    }

    const canvas = new OffscreenCanvas(width, height);
    const ctx = canvas.getContext("2d");
    ctx.drawImage(bitmap, x, y, width, height, 0, 0, width, height);

    return await canvas.convertToBlob({ type: "image/png" });
  } finally {
    // Release the decoded screenshot as soon as the crop is done.
    bitmap.close();
  }
}
104
+
105
/**
 * Reads a Blob with FileReader and resolves with its contents as a data URL.
 *
 * @param {Blob} blob - Blob to encode.
 * @returns {Promise<string>} Resolves with `reader.result`; rejects with the
 *   FileReader error event.
 */
async function blobToDataUrl(blob) {
  const reader = new FileReader();
  const finished = new Promise((resolve, reject) => {
    reader.onload = () => resolve(reader.result);
    reader.onerror = reject;
  });
  reader.readAsDataURL(blob);
  return finished;
}
113
+
114
+ // ── Server health check ───────────────────────────────────────────────────────
115
+
116
/**
 * Asks the backend's /health endpoint whether it is up.
 * The request is aborted after three seconds.
 *
 * @returns {Promise<boolean>} true only when the JSON reply has `status === "ok"`;
 *   false on any network, timeout or parse failure.
 */
async function checkServer() {
  const healthUrl = "http://localhost:8000/health";
  try {
    const response = await fetch(healthUrl, { signal: AbortSignal.timeout(3000) });
    const { status } = await response.json();
    return status === "ok";
  } catch {
    return false;
  }
}
content.css ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* content.css β€” Styles injected into every page for the selection overlay */
2
+
3
+ /* ── Overlay ── */
4
+ #glmocr-overlay {
5
+ position: fixed !important;
6
+ inset: 0 !important;
7
+ background: rgba(0, 0, 0, 0.45) !important;
8
+ z-index: 2147483646 !important;
9
+ cursor: crosshair !important;
10
+ user-select: none !important;
11
+ }
12
+
13
+ /* ── Hint text ── */
14
+ #glmocr-hint {
15
+ position: absolute !important;
16
+ top: 20px !important;
17
+ left: 50% !important;
18
+ transform: translateX(-50%) !important;
19
+ background: rgba(0, 0, 0, 0.8) !important;
20
+ color: #f5f0e8 !important;
21
+ font-family: 'IBM Plex Mono', monospace, monospace !important;
22
+ font-size: 13px !important;
23
+ padding: 10px 18px !important;
24
+ border-radius: 4px !important;
25
+ letter-spacing: 0.04em !important;
26
+ pointer-events: none !important;
27
+ white-space: nowrap !important;
28
+ border: 1px solid rgba(255,255,255,0.15) !important;
29
+ transition: opacity 0.2s !important;
30
+ }
31
+
32
+ /* ── Selection box ── */
33
#glmocr-selbox {
  position: fixed !important;
  /* Hidden until a drag starts. Deliberately NOT !important: content.js reveals
     the box with an inline `display:block`, and an !important stylesheet
     declaration would override that inline style and keep the box invisible. */
  display: none;
  border: 2px solid #c94a1f !important;
  background: rgba(201, 74, 31, 0.08) !important;
  box-shadow: 0 0 0 9999px rgba(0, 0, 0, 0.35) !important;
  pointer-events: none !important;
  z-index: 2147483647 !important;
}
42
+
43
+ /* ── Sidebar iframe ── */
44
+ #glmocr-sidebar {
45
+ position: fixed !important;
46
+ top: 0 !important;
47
+ right: 0 !important;
48
+ width: 380px !important;
49
+ height: 100vh !important;
50
+ border: none !important;
51
+ z-index: 2147483645 !important;
52
+ border-left: 2px solid #d4cfc3 !important;
53
+ box-shadow: -4px 0 24px rgba(0,0,0,0.12) !important;
54
+ animation: glmocr-slideIn 0.25s cubic-bezier(0.22, 1, 0.36, 1) !important;
55
+ }
56
+
57
+ @keyframes glmocr-slideIn {
58
+ from { transform: translateX(100%); opacity: 0; }
59
+ to { transform: translateX(0); opacity: 1; }
60
+ }
61
+
62
+ /* ── Toast ── */
63
+ #glmocr-toast {
64
+ position: fixed !important;
65
+ bottom: 24px !important;
66
+ left: 50% !important;
67
+ transform: translateX(-50%) !important;
68
+ background: #0f0e0d !important;
69
+ color: #f5f0e8 !important;
70
+ font-family: 'IBM Plex Mono', monospace, monospace !important;
71
+ font-size: 13px !important;
72
+ padding: 10px 20px !important;
73
+ border-radius: 4px !important;
74
+ z-index: 2147483647 !important;
75
+ white-space: nowrap !important;
76
+ animation: glmocr-fadeUp 0.3s ease both !important;
77
+ }
78
+
79
+ @keyframes glmocr-fadeUp {
80
+ from { opacity: 0; transform: translateX(-50%) translateY(12px); }
81
+ to { opacity: 1; transform: translateX(-50%) translateY(0); }
82
+ }
content.js ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // content.js β€” Injected into every page
2
+ // Manages the screen selection overlay and sidebar panel
3
+
4
+ let overlayActive = false;
5
+ let sidebarFrame = null;
6
+
7
+ // ── Listen for messages from background / popup ───────────────────────────────
8
+
9
// Dispatches extension messages by type:
//   START_SELECTION → open the selection overlay (unless one is active) and acknowledge
//   SHOW_SIDEBAR    → open the sidebar with an empty payload
//   SHOW_RESULT     → open the sidebar with the supplied data
// All handled branches reply synchronously, hence `return false`.
chrome.runtime.onMessage.addListener((msg, _sender, sendResponse) => {
  switch (msg.type) {
    case "START_SELECTION":
      if (!overlayActive) startSelection();
      sendResponse({ ok: true });
      return false;

    case "SHOW_SIDEBAR":
      showSidebar({});
      return false;

    case "SHOW_RESULT":
      showSidebar(msg.data);
      return false;

    default:
      return undefined;
  }
});
28
+
29
+ // ── Selection overlay ─────────────────────────────────────────────────────────
30
+
31
/**
 * Enters region-selection mode: dims the page with an overlay, lets the user
 * drag out a rectangle, then removes the overlay and hands the rectangle to
 * runOcr(). Esc cancels; selections smaller than 10×10 px are rejected.
 *
 * The rectangle passed on is in viewport (client) coordinates plus the current
 * devicePixelRatio. Scroll offsets are intentionally not added: the background
 * worker crops the rectangle out of a screenshot of the visible tab area.
 */
function startSelection() {
  overlayActive = true;

  // Dim the page; the overlay also receives all drag events.
  const overlay = document.createElement("div");
  overlay.id = "glmocr-overlay";

  // Crosshair hint
  const hint = document.createElement("div");
  hint.id = "glmocr-hint";
  hint.textContent = "Drag to select text region — Press Esc to cancel";
  overlay.appendChild(hint);

  // Selection box
  const selBox = document.createElement("div");
  selBox.id = "glmocr-selbox";
  overlay.appendChild(selBox);

  document.body.appendChild(overlay);

  let startX = 0;
  let startY = 0;
  let isDragging = false;

  // Box spanned by the drag anchor and the pointer position of `e`.
  function dragBox(e) {
    return {
      x: Math.min(e.clientX, startX),
      y: Math.min(e.clientY, startY),
      w: Math.abs(e.clientX - startX),
      h: Math.abs(e.clientY - startY),
    };
  }

  // Positions and reveals the selection box. `display` carries !important so
  // it still wins if the stylesheet hides the box with an !important rule.
  function drawSelBox({ x, y, w, h }) {
    selBox.style.cssText = `left:${x}px; top:${y}px; width:${w}px; height:${h}px; display:block !important`;
  }

  function onMouseDown(e) {
    if (e.button !== 0) return; // primary button only
    isDragging = true;
    startX = e.clientX;
    startY = e.clientY;
    drawSelBox({ x: startX, y: startY, w: 0, h: 0 });
    hint.style.opacity = "0";
    e.preventDefault();
  }

  function onMouseMove(e) {
    if (!isDragging) return;
    drawSelBox(dragBox(e));
  }

  function onMouseUp(e) {
    if (!isDragging) return;
    isDragging = false;

    const { x, y, w, h } = dragBox(e);

    cleanup();

    if (w < 10 || h < 10) {
      showToast("Selection too small — try again.");
      return;
    }

    const dpr = window.devicePixelRatio || 1;
    runOcr({ x, y, width: w, height: h, dpr });
  }

  function onKeyDown(e) {
    if (e.key === "Escape") {
      cleanup();
      showToast("Cancelled.");
    }
  }

  // Leaves selection mode: detaches every listener and removes the overlay.
  function cleanup() {
    overlayActive = false;
    overlay.removeEventListener("mousedown", onMouseDown);
    overlay.removeEventListener("mousemove", onMouseMove);
    overlay.removeEventListener("mouseup", onMouseUp);
    document.removeEventListener("keydown", onKeyDown);
    overlay.remove();
  }

  overlay.addEventListener("mousedown", onMouseDown);
  overlay.addEventListener("mousemove", onMouseMove);
  overlay.addEventListener("mouseup", onMouseUp);
  document.addEventListener("keydown", onKeyDown);
}
121
+
122
+ // ── Send region to background for capture + OCR ───────────────────────────────
123
+
124
/**
 * Shows a loading sidebar, asks the background worker to capture and OCR
 * `rect`, then replaces the sidebar with the result or an error.
 *
 * NOTE(review): the loading sidebar is added before the background worker
 * takes its screenshot, so it may appear in captures of the right-hand edge of
 * the viewport — confirm whether that matters.
 *
 * @param {{x: number, y: number, width: number, height: number, dpr: number}} rect
 *   Selection rectangle forwarded in the CAPTURE_REGION message.
 */
function runOcr(rect) {
  // Show a loading sidebar immediately
  showSidebar({ loading: true });

  chrome.runtime.sendMessage({ type: "CAPTURE_REGION", rect }, (response) => {
    if (chrome.runtime.lastError) {
      showSidebar({ error: chrome.runtime.lastError.message });
      return;
    }

    // The callback can fire without a payload if the worker never replied.
    if (!response) {
      showSidebar({ error: "No response from the extension background worker." });
      return;
    }

    if (response.success) {
      showSidebar(response);
    } else {
      // Always pass a non-empty message so the sidebar renders its error state.
      showSidebar({ error: response.error || "OCR request failed." });
    }
  });
}
140
+
141
+ // ── Sidebar panel ─────────────────────────────────────────────────────────────
142
+
143
// The window "message" listener serving the currently open sidebar, if any.
// Only one is registered at a time; it is removed whenever the sidebar closes.
let sidebarMessageHandler = null;

// Removes the sidebar iframe (if present) together with its message listener.
function closeSidebar() {
  if (sidebarMessageHandler) {
    window.removeEventListener("message", sidebarMessageHandler);
    sidebarMessageHandler = null;
  }
  if (sidebarFrame) {
    sidebarFrame.remove();
    sidebarFrame = null;
  }
}

/**
 * Opens the sidebar iframe (replacing any existing one) and, once it has
 * loaded, posts `data` to it as a SIDEBAR_DATA message.
 *
 * Messages coming back are only honoured when they originate from this
 * sidebar's own window: CLOSE_SIDEBAR closes it, START_NEW_SELECTION closes it
 * and starts a new selection.
 *
 * @param {object} data - Payload for the sidebar (loading flag, result or error).
 */
function showSidebar(data) {
  // Remove existing sidebar (and its listener) if any
  closeSidebar();

  const sidebarUrl = chrome.runtime.getURL("sidebar.html");

  const frame = document.createElement("iframe");
  frame.id = "glmocr-sidebar";
  // Lets the sidebar's Copy button use the async Clipboard API from inside a
  // cross-origin frame.
  frame.allow = "clipboard-write";

  // Wait for iframe to load, then send data — addressed to the extension's
  // origin only, so the payload cannot reach any other document.
  frame.addEventListener("load", () => {
    frame.contentWindow?.postMessage(
      { type: "SIDEBAR_DATA", data },
      new URL(sidebarUrl).origin
    );
  });
  frame.src = sidebarUrl;

  sidebarMessageHandler = (e) => {
    // Ignore messages that were not posted by this sidebar's window.
    if (e.source !== frame.contentWindow) return;

    if (e.data?.type === "CLOSE_SIDEBAR") {
      closeSidebar();
    } else if (e.data?.type === "START_NEW_SELECTION") {
      closeSidebar();
      if (!overlayActive) startSelection();
    }
  };
  window.addEventListener("message", sidebarMessageHandler);

  document.body.appendChild(frame);
  sidebarFrame = frame;
}
172
+
173
+ // ── Toast notification ────────────────────────────────────────────────────────
174
+
175
/**
 * Shows `msg` in a #glmocr-toast element appended to the page body, replacing
 * any toast already on screen, and removes it again after three seconds.
 *
 * @param {string} msg - Text to display.
 */
function showToast(msg) {
  document.getElementById("glmocr-toast")?.remove();

  const toastEl = document.createElement("div");
  toastEl.id = "glmocr-toast";
  toastEl.textContent = msg;
  document.body.appendChild(toastEl);

  setTimeout(() => {
    toastEl.remove();
  }, 3000);
}
docker-compose.yml ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
version: "3.9"

services:
  got-ocr:
    build: .
    image: got-ocr:latest
    container_name: got-ocr-backend
    ports:
      # host 8000 -> container 7860: the image's CMD starts uvicorn on 7860,
      # so publishing container port 8000 would expose nothing.
      - "8000:7860"
    environment:
      - PYTHONUNBUFFERED=1
    restart: unless-stopped

    # ── GPU support (comment out for CPU-only) ───────────────────────────
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]

    # ── Health check ─────────────────────────────────────────────────────
    # Runs inside the container, so it must target the port uvicorn listens on.
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:7860/health"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 60s  # give model time to load

    # ── Optional: persist HuggingFace model cache across rebuilds ────────
    volumes:
      - hf_cache:/root/.cache/huggingface

volumes:
  hf_cache:
frontend/index.html ADDED
@@ -0,0 +1,759 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8"/>
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0"/>
6
+ <title>GLM-OCR — Self-Hosted Document OCR</title>
7
+ <link rel="preconnect" href="https://fonts.googleapis.com"/>
8
+ <link href="https://fonts.googleapis.com/css2?family=IBM+Plex+Mono:wght@400;500;700&family=DM+Serif+Display:ital@0;1&family=DM+Sans:wght@300;400;500&display=swap" rel="stylesheet"/>
9
+ <style>
10
+ *, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }
11
+
12
+ :root {
13
+ --ink: #0f0e0d;
14
+ --paper: #f5f0e8;
15
+ --warm: #ede8dc;
16
+ --border: #d4cfc3;
17
+ --muted: #8f8880;
18
+ --accent: #c94a1f;
19
+ --green: #1a6b4a;
20
+ --mono: 'IBM Plex Mono', monospace;
21
+ --serif: 'DM Serif Display', serif;
22
+ --sans: 'DM Sans', sans-serif;
23
+ }
24
+
25
+ html { scroll-behavior: smooth; }
26
+
27
+ body {
28
+ background: var(--paper);
29
+ color: var(--ink);
30
+ font-family: var(--sans);
31
+ min-height: 100vh;
32
+ }
33
+
34
+ body::before {
35
+ content: '';
36
+ position: fixed;
37
+ inset: 0;
38
+ background-image: radial-gradient(circle, rgba(0,0,0,0.055) 1px, transparent 1px);
39
+ background-size: 18px 18px;
40
+ pointer-events: none;
41
+ z-index: 0;
42
+ }
43
+
44
+ .page { position: relative; z-index: 1; }
45
+
46
+ /* ── MASTHEAD ── */
47
+ .masthead {
48
+ border-bottom: 3px solid var(--ink);
49
+ padding: 0 48px;
50
+ display: grid;
51
+ grid-template-columns: 1fr auto 1fr;
52
+ align-items: center;
53
+ min-height: 68px;
54
+ gap: 16px;
55
+ }
56
+
57
+ .masthead-left {
58
+ font-family: var(--mono);
59
+ font-size: 0.62rem;
60
+ color: var(--muted);
61
+ letter-spacing: 0.08em;
62
+ text-transform: uppercase;
63
+ }
64
+
65
+ .masthead-center {
66
+ font-family: var(--serif);
67
+ font-size: 1.3rem;
68
+ white-space: nowrap;
69
+ }
70
+
71
+ .masthead-right {
72
+ display: flex;
73
+ justify-content: flex-end;
74
+ gap: 6px;
75
+ }
76
+
77
+ .pill {
78
+ font-family: var(--mono);
79
+ font-size: 0.6rem;
80
+ letter-spacing: 0.08em;
81
+ text-transform: uppercase;
82
+ padding: 4px 9px;
83
+ border-radius: 2px;
84
+ border: 1px solid var(--border);
85
+ color: var(--muted);
86
+ display: flex;
87
+ align-items: center;
88
+ gap: 5px;
89
+ }
90
+
91
+ .pill.live { border-color: var(--green); color: var(--green); }
92
+
93
+ .status-dot {
94
+ width: 6px; height: 6px;
95
+ border-radius: 50%;
96
+ background: var(--muted);
97
+ flex-shrink: 0;
98
+ }
99
+ .status-dot.ok { background: var(--green); }
100
+ .status-dot.err { background: var(--accent); }
101
+ .status-dot.pulse {
102
+ animation: blink 1.2s ease-in-out infinite;
103
+ }
104
+
105
+ @keyframes blink { 0%,100%{opacity:1} 50%{opacity:0.3} }
106
+
107
+ /* ── HERO ── */
108
+ .hero {
109
+ padding: 64px 48px 48px;
110
+ border-bottom: 1px solid var(--border);
111
+ display: grid;
112
+ grid-template-columns: 1fr 1fr;
113
+ gap: 48px;
114
+ align-items: end;
115
+ }
116
+
117
+ .hero-headline {
118
+ font-family: var(--serif);
119
+ font-size: clamp(2.8rem, 5.5vw, 5rem);
120
+ line-height: 1.02;
121
+ letter-spacing: -0.02em;
122
+ }
123
+
124
+ .hero-headline em { font-style: italic; color: var(--accent); }
125
+
126
+ .hero-right { display: flex; flex-direction: column; gap: 20px; }
127
+
128
+ .hero-desc {
129
+ font-size: 0.88rem;
130
+ color: var(--muted);
131
+ line-height: 1.75;
132
+ }
133
+
134
+ .hero-stats {
135
+ display: flex;
136
+ gap: 24px;
137
+ flex-wrap: wrap;
138
+ }
139
+
140
+ .stat { display: flex; flex-direction: column; gap: 2px; }
141
+
142
+ .stat strong {
143
+ font-family: var(--serif);
144
+ font-size: 1.5rem;
145
+ color: var(--accent);
146
+ }
147
+
148
+ .stat span {
149
+ font-family: var(--mono);
150
+ font-size: 0.58rem;
151
+ color: var(--muted);
152
+ letter-spacing: 0.1em;
153
+ text-transform: uppercase;
154
+ }
155
+
156
+ /* ── MAIN ── */
157
+ .main {
158
+ display: grid;
159
+ grid-template-columns: 1fr 1fr;
160
+ border-bottom: 1px solid var(--border);
161
+ }
162
+
163
+ .col { padding: 36px 48px; }
164
+ .col + .col { border-left: 1px solid var(--border); }
165
+
166
+ .col-label {
167
+ font-family: var(--mono);
168
+ font-size: 0.62rem;
169
+ color: var(--muted);
170
+ letter-spacing: 0.12em;
171
+ text-transform: uppercase;
172
+ margin-bottom: 20px;
173
+ display: flex;
174
+ align-items: center;
175
+ gap: 8px;
176
+ }
177
+
178
+ .col-label::after { content: ''; flex: 1; height: 1px; background: var(--border); }
179
+
180
+ /* ── DROPZONE ── */
181
+ #dropzone {
182
+ border: 2px dashed var(--border);
183
+ border-radius: 4px;
184
+ min-height: 240px;
185
+ display: flex;
186
+ flex-direction: column;
187
+ align-items: center;
188
+ justify-content: center;
189
+ gap: 14px;
190
+ cursor: pointer;
191
+ transition: all 0.2s;
192
+ position: relative;
193
+ overflow: hidden;
194
+ }
195
+
196
+ #dropzone:hover, #dropzone.over {
197
+ border-color: var(--accent);
198
+ background: rgba(201,74,31,0.04);
199
+ }
200
+
201
+ #dropzone.over::after {
202
+ content: 'Drop!';
203
+ position: absolute; inset: 0;
204
+ background: rgba(201,74,31,0.08);
205
+ display: grid; place-items: center;
206
+ font-family: var(--serif);
207
+ font-size: 2rem;
208
+ color: var(--accent);
209
+ }
210
+
211
+ .dz-icon { font-size: 2.2rem; }
212
+
213
+ .dz-label strong { display: block; font-weight: 500; font-size: 0.88rem; margin-bottom: 5px; text-align:center; }
214
+ .dz-label span { font-family: var(--mono); font-size: 0.64rem; color: var(--muted); }
215
+
216
+ #file-input { display: none; }
217
+
218
+ /* Preview */
219
+ #preview-wrap { display: none; }
220
+ #preview-wrap.active { display: block; }
221
+
222
+ #preview-img {
223
+ width: 100%; max-height: 240px;
224
+ object-fit: contain;
225
+ border: 1px solid var(--border);
226
+ border-radius: 2px;
227
+ background: var(--warm);
228
+ }
229
+
230
+ .file-meta {
231
+ margin-top: 8px;
232
+ font-family: var(--mono);
233
+ font-size: 0.65rem;
234
+ color: var(--muted);
235
+ display: flex;
236
+ justify-content: space-between;
237
+ }
238
+
239
+ /* ── MODE ── */
240
+ .mode-row {
241
+ margin: 18px 0 14px;
242
+ display: flex;
243
+ gap: 8px;
244
+ }
245
+
246
+ .mode-btn {
247
+ font-family: var(--mono);
248
+ font-size: 0.67rem;
249
+ letter-spacing: 0.04em;
250
+ padding: 9px 14px;
251
+ border: 1px solid var(--border);
252
+ background: transparent;
253
+ color: var(--muted);
254
+ cursor: pointer;
255
+ border-radius: 2px;
256
+ transition: all 0.15s;
257
+ display: flex;
258
+ flex-direction: column;
259
+ gap: 2px;
260
+ flex: 1;
261
+ text-align: left;
262
+ }
263
+
264
+ .mode-btn .mode-name { font-weight: 700; color: var(--ink); }
265
+ .mode-btn .mode-desc { font-size: 0.58rem; }
266
+
267
+ .mode-btn.selected {
268
+ background: var(--ink);
269
+ border-color: var(--ink);
270
+ color: var(--paper);
271
+ }
272
+
273
+ .mode-btn.selected .mode-name { color: var(--paper); }
274
+
275
+ /* ── RUN BTN ── */
276
+ .run-btn {
277
+ width: 100%;
278
+ padding: 14px;
279
+ background: var(--accent);
280
+ color: white;
281
+ border: none;
282
+ border-radius: 2px;
283
+ font-family: var(--serif);
284
+ font-size: 1.05rem;
285
+ cursor: pointer;
286
+ transition: background 0.15s;
287
+ }
288
+
289
+ .run-btn:hover:not(:disabled) { background: #b53d15; }
290
+ .run-btn:disabled { opacity: 0.35; cursor: not-allowed; }
291
+
292
+ .clear-link {
293
+ font-family: var(--mono);
294
+ font-size: 0.64rem;
295
+ color: var(--muted);
296
+ text-decoration: underline;
297
+ cursor: pointer;
298
+ display: none;
299
+ margin-top: 10px;
300
+ text-align: center;
301
+ }
302
+
303
+ /* ── OUTPUT ── */
304
+ .output-area {
305
+ min-height: 300px;
306
+ display: flex;
307
+ flex-direction: column;
308
+ }
309
+
310
+ #out-placeholder {
311
+ flex: 1;
312
+ display: flex;
313
+ flex-direction: column;
314
+ align-items: center;
315
+ justify-content: center;
316
+ gap: 10px;
317
+ border: 1px dashed var(--border);
318
+ border-radius: 2px;
319
+ }
320
+
321
+ #out-placeholder .ph { font-size: 2rem; opacity: 0.3; }
322
+ #out-placeholder p { font-family: var(--mono); font-size: 0.68rem; color: var(--muted); text-align: center; line-height: 1.9; }
323
+
324
+ /* Loading */
325
+ #out-loading { display: none; flex: 1; flex-direction: column; align-items: center; justify-content: center; gap: 16px; }
326
+ #out-loading.active { display: flex; }
327
+
328
+ .scan-bar-wrap { width: 160px; height: 3px; background: var(--border); border-radius: 2px; overflow: hidden; }
329
+ .scan-bar { height: 100%; background: var(--accent); border-radius: 2px; animation: scan 1.4s ease-in-out infinite; }
330
+ @keyframes scan { 0%{transform:translateX(-100%)} 50%{transform:translateX(0)} 100%{transform:translateX(100%)} }
331
+
332
+ .scan-label { font-family: var(--mono); font-size: 0.68rem; color: var(--muted); animation: blink 1.4s ease-in-out infinite; }
333
+
334
+ /* Error */
335
+ #out-error { display: none; background: #fff0f0; border: 1px solid rgba(201,74,31,0.3); border-radius: 2px; padding: 16px; font-family: var(--mono); font-size: 0.72rem; color: var(--accent); line-height: 1.7; }
336
+ #out-error.active { display: block; }
337
+
338
+ /* Result */
339
+ #out-result { display: none; flex-direction: column; gap: 10px; }
340
+ #out-result.active { display: flex; }
341
+
342
+ #result-meta { display: flex; gap: 14px; flex-wrap: wrap; }
343
+
344
+ .chip { font-family: var(--mono); font-size: 0.62rem; color: var(--muted); }
345
+ .chip strong { color: var(--green); }
346
+
347
+ #result-content {
348
+ background: var(--warm);
349
+ border: 1px solid var(--border);
350
+ border-radius: 2px;
351
+ padding: 18px;
352
+ font-family: var(--mono);
353
+ font-size: 0.78rem;
354
+ line-height: 1.9;
355
+ white-space: pre-wrap;
356
+ word-break: break-word;
357
+ max-height: 340px;
358
+ overflow-y: auto;
359
+ flex: 1;
360
+ }
361
+
362
+ .result-actions { display: flex; gap: 8px; }
363
+
364
+ .action-btn {
365
+ font-family: var(--mono);
366
+ font-size: 0.65rem;
367
+ letter-spacing: 0.05em;
368
+ padding: 9px 14px;
369
+ border: 1px solid var(--border);
370
+ background: transparent;
371
+ color: var(--ink);
372
+ cursor: pointer;
373
+ border-radius: 2px;
374
+ transition: border-color 0.15s;
375
+ flex: 1;
376
+ }
377
+
378
+ .action-btn:hover { border-color: var(--ink); }
379
+
380
+ /* ── STATUS BAR ── */
381
+ .statusbar {
382
+ border-top: 3px double var(--border);
383
+ padding: 14px 48px;
384
+ display: flex;
385
+ gap: 32px;
386
+ flex-wrap: wrap;
387
+ font-family: var(--mono);
388
+ font-size: 0.64rem;
389
+ color: var(--muted);
390
+ }
391
+
392
+ .statusbar strong { color: var(--green); }
393
+
394
+ footer {
395
+ border-top: 1px solid var(--border);
396
+ padding: 18px 48px;
397
+ display: flex;
398
+ justify-content: space-between;
399
+ font-family: var(--mono);
400
+ font-size: 0.62rem;
401
+ color: var(--muted);
402
+ }
403
+
404
+ footer a { color: var(--ink); text-decoration: underline; }
405
+
406
+ /* ── TOAST ── */
407
+ .toast {
408
+ position: fixed;
409
+ bottom: 24px; right: 24px;
410
+ background: var(--ink);
411
+ color: var(--paper);
412
+ font-family: var(--mono);
413
+ font-size: 0.7rem;
414
+ padding: 11px 18px;
415
+ border-radius: 2px;
416
+ transform: translateY(60px);
417
+ opacity: 0;
418
+ transition: all 0.3s cubic-bezier(0.34, 1.56, 0.64, 1);
419
+ z-index: 999;
420
+ }
421
+
422
+ .toast.show { transform: translateY(0); opacity: 1; }
423
+
424
+ @keyframes fadeUp { from{opacity:0;transform:translateY(16px)} to{opacity:1;transform:translateY(0)} }
425
+ .masthead { animation: fadeUp 0.5s ease both; }
426
+ .hero { animation: fadeUp 0.5s 0.08s ease both; }
427
+ .main { animation: fadeUp 0.5s 0.16s ease both; }
428
+
429
+ @media (max-width: 820px) {
430
+ .masthead, .hero, .col, .statusbar, footer { padding-left: 24px; padding-right: 24px; }
431
+ .hero, .main { grid-template-columns: 1fr; }
432
+ .col + .col { border-left: none; border-top: 1px solid var(--border); }
433
+ }
434
+ </style>
435
+ </head>
436
+ <body>
437
+ <div class="page">
438
+
439
+ <!-- MASTHEAD -->
440
+ <div class="masthead">
441
+ <div class="masthead-left">zai-org/GLM-OCR · 0.9B params</div>
442
+ <div class="masthead-center">GLM-OCR Engine</div>
443
+ <div class="masthead-right">
444
+ <div class="pill" id="server-pill">
445
+ <div class="status-dot pulse" id="status-dot"></div>
446
+ <span id="status-label">connecting…</span>
447
+ </div>
448
+ <div class="pill live">self-hosted</div>
449
+ </div>
450
+ </div>
451
+
452
+ <!-- HERO -->
453
+ <section class="hero">
454
+ <div>
455
+ <h1 class="hero-headline">GLM<br><em>Vision</em><br>OCR</h1>
456
+ </div>
457
+ <div class="hero-right">
458
+ <p class="hero-desc">
459
+ Self-hosted OCR powered by <strong>zai-org/GLM-OCR</strong> — a 0.9B vision-language model
460
+ ranking #1 on OmniDocBench V1.5. Handles plain text, tables, math formulas,
461
+ and structured document parsing.
462
+ </p>
463
+ <div class="hero-stats">
464
+ <div class="stat"><strong id="stat-count">0</strong><span>Processed</span></div>
465
+ <div class="stat"><strong id="stat-words">0</strong><span>Words</span></div>
466
+ <div class="stat"><strong id="stat-lat">—</strong><span>Avg Latency</span></div>
467
+ </div>
468
+ </div>
469
+ </section>
470
+
471
+ <!-- MAIN -->
472
+ <div class="main">
473
+
474
+ <!-- LEFT -->
475
+ <div class="col">
476
+ <div class="col-label">01 &nbsp; Upload Image</div>
477
+
478
+ <div id="dropzone">
479
+ <div class="dz-icon">🖼</div>
480
+ <div class="dz-label">
481
+ <strong>Drag & drop an image</strong>
482
+ <span>PNG · JPG · WEBP · BMP · TIFF · Max 20 MB</span>
483
+ </div>
484
+ <input type="file" id="file-input" accept="image/*"/>
485
+ </div>
486
+
487
+ <div id="preview-wrap">
488
+ <img id="preview-img" src="" alt="Preview"/>
489
+ <div class="file-meta">
490
+ <span id="file-name"></span>
491
+ <span id="file-size"></span>
492
+ </div>
493
+ </div>
494
+
495
+ <!-- Mode selector -->
496
+ <div class="mode-row">
497
+ <button class="mode-btn selected" data-mode="recognize">
498
+ <span class="mode-name">recognize</span>
499
+ <span class="mode-desc">Plain text · preserves layout</span>
500
+ </button>
501
+ <button class="mode-btn" data-mode="parse">
502
+ <span class="mode-name">parse</span>
503
+ <span class="mode-desc">Structured markdown output</span>
504
+ </button>
505
+ </div>
506
+
507
+ <button class="run-btn" id="run-btn" disabled>⟑ &nbsp;Run GLM-OCR</button>
508
+ <div class="clear-link" id="clear-link">Clear image</div>
509
+ </div>
510
+
511
+ <!-- RIGHT -->
512
+ <div class="col">
513
+ <div class="col-label">02 &nbsp; Extracted Text</div>
514
+
515
+ <div class="output-area">
516
+ <div id="out-placeholder">
517
+ <div class="ph">📄</div>
518
+ <p>Upload an image and click<br>"Run GLM-OCR" to begin.</p>
519
+ </div>
520
+
521
+ <div id="out-loading">
522
+ <div class="scan-bar-wrap"><div class="scan-bar"></div></div>
523
+ <div class="scan-label" id="loading-label">Initialising…</div>
524
+ </div>
525
+
526
+ <div id="out-error"></div>
527
+
528
+ <div id="out-result">
529
+ <div id="result-meta"></div>
530
+ <div id="result-content"></div>
531
+ <div class="result-actions">
532
+ <button class="action-btn" id="copy-btn">Copy text</button>
533
+ <button class="action-btn" id="dl-btn">Download .txt</button>
534
+ </div>
535
+ </div>
536
+ </div>
537
+ </div>
538
+ </div>
539
+
540
+ <!-- STATUS BAR -->
541
+ <div class="statusbar">
542
+ <span>Model: <strong id="sb-model">—</strong></span>
543
+ <span>Device: <strong id="sb-device">—</strong></span>
544
+ <span>Uptime: <strong id="sb-uptime">—</strong></span>
545
+ <span>Errors: <strong id="sb-errors">—</strong></span>
546
+ </div>
547
+
548
+ <footer>
549
+ <span>GLM-OCR · <a href="https://arxiv.org/abs/2603.10910" target="_blank">Paper ↗</a> · <a href="https://huggingface.co/zai-org/GLM-OCR" target="_blank">HuggingFace ↗</a></span>
550
+ <span>Self-hosted · No data leaves your server · CS Portfolio Project</span>
551
+ </footer>
552
+ </div>
553
+
554
+ <div class="toast" id="toast"></div>
555
+
556
+ <script>
557
+ const API = '';
558
+ let selectedMode = 'recognize';
559
+ let imageFile = null;
560
+
561
+ // Elements
562
+ const dropzone = document.getElementById('dropzone');
563
+ const fileInput = document.getElementById('file-input');
564
+ const previewWrap= document.getElementById('preview-wrap');
565
+ const previewImg = document.getElementById('preview-img');
566
+ const runBtn = document.getElementById('run-btn');
567
+ const clearLink = document.getElementById('clear-link');
568
+
569
+ const outPlaceholder = document.getElementById('out-placeholder');
570
+ const outLoading = document.getElementById('out-loading');
571
+ const outError = document.getElementById('out-error');
572
+ const outResult = document.getElementById('out-result');
573
+ const loadingLabel = document.getElementById('loading-label');
574
+ const resultMeta = document.getElementById('result-meta');
575
+ const resultContent = document.getElementById('result-content');
576
+
577
+ // ── Health ──────────────────────────────────────────────────────────────
578
/**
 * Query GET `${API}/health` and mirror the result in the masthead status
 * pill and the status bar.
 *
 * - status "ok": marks the model ready and fills in model name / device.
 * - any other status: shows "loading model…" and polls again in 3 s.
 * - request/parse failure: shows "server offline" and polls again in 3 s, so
 *   the indicator recovers after a transient failure instead of staying
 *   offline until the page is reloaded.
 */
async function pollHealth() {
  const dot = document.getElementById('status-dot');
  const lbl = document.getElementById('status-label');

  try {
    const r = await fetch(`${API}/health`);
    const data = await r.json();

    if (data.status === 'ok') {
      dot.className = 'status-dot ok';
      lbl.textContent = 'model ready';
      // model_id looks like "org/name"; show only the part after the slash.
      document.getElementById('sb-model').textContent = data.model?.model_id?.split('/')[1] || '—';
      document.getElementById('sb-device').textContent = data.model?.device || '—';
    } else {
      dot.className = 'status-dot pulse';
      lbl.textContent = 'loading model…';
      setTimeout(pollHealth, 3000);
    }
  } catch {
    dot.className = 'status-dot err';
    lbl.textContent = 'server offline';
    setTimeout(pollHealth, 3000);
  }
}
600
+
601
/**
 * Fetch GET `${API}/metrics` and refresh the hero counters and status bar
 * (request count, words extracted, average latency, uptime, error count).
 *
 * Failures are ignored on purpose: the previous values simply stay on screen
 * until a later poll succeeds.
 */
async function pollMetrics() {
  const setText = (id, value) => {
    document.getElementById(id).textContent = value;
  };

  try {
    const r = await fetch(`${API}/metrics`);
    const data = await r.json();

    setText('stat-count', data.total_requests);
    setText('stat-words', data.total_words_extracted.toLocaleString());
    // Latency arrives in milliseconds; show seconds, or a dash when there is
    // no average yet (0 / missing).
    setText(
      'stat-lat',
      data.avg_latency_ms ? `${(data.avg_latency_ms / 1000).toFixed(1)}s` : '—',
    );
    setText(
      'sb-uptime',
      `${Math.floor(data.uptime_seconds / 60)}m ${(data.uptime_seconds % 60) | 0}s`,
    );
    setText('sb-errors', data.error_count);
  } catch {
    // Best-effort refresh: keep showing the last known values.
  }
}
614
+
615
+ pollHealth();
616
+ pollMetrics();
617
+ setInterval(pollMetrics, 5000);
618
+
619
+ // ── Mode ────────────────────────────────────────────────────────────────
620
+ document.querySelectorAll('.mode-btn').forEach(btn => {
621
+ btn.addEventListener('click', () => {
622
+ document.querySelectorAll('.mode-btn').forEach(b => b.classList.remove('selected'));
623
+ btn.classList.add('selected');
624
+ selectedMode = btn.dataset.mode;
625
+ });
626
+ });
627
+
628
+ // ── File ────────────────────────────────────────────────────────────────
629
/**
 * Accept a user-chosen file as the image to OCR and show its preview.
 *
 * Non-image (or missing) files are ignored. For an image, the file is stored
 * in `imageFile`, read as a data URL, and once read the dropzone is swapped
 * for the preview, the run button is enabled and the output panel is reset.
 *
 * @param {File} [file] - File from the input element or a drop event.
 */
function loadFile(file) {
  const isImage = Boolean(file) && file.type.startsWith('image/');
  if (!isImage) {
    return;
  }

  imageFile = file;

  const sizeLabel = () => `${(file.size / 1024).toFixed(1)} KB`;
  const reader = new FileReader();
  reader.onload = ({ target }) => {
    previewImg.src = target.result;
    document.getElementById('file-name').textContent = file.name;
    document.getElementById('file-size').textContent = sizeLabel();

    // Swap dropzone -> preview and unlock the controls.
    dropzone.style.display = 'none';
    previewWrap.classList.add('active');
    clearLink.style.display = 'block';
    runBtn.disabled = false;
    resetOutput();
  };
  reader.readAsDataURL(file);
}
645
+
646
+ dropzone.addEventListener('click', () => fileInput.click());
647
+ fileInput.addEventListener('change', e => loadFile(e.target.files[0]));
648
+ dropzone.addEventListener('dragover', e => { e.preventDefault(); dropzone.classList.add('over'); });
649
+ dropzone.addEventListener('dragleave', () => dropzone.classList.remove('over'));
650
+ dropzone.addEventListener('drop', e => {
651
+ e.preventDefault(); dropzone.classList.remove('over'); loadFile(e.dataTransfer.files[0]);
652
+ });
653
+
654
+ clearLink.addEventListener('click', () => {
655
+ imageFile = null; fileInput.value = '';
656
+ previewWrap.classList.remove('active');
657
+ dropzone.style.display = '';
658
+ clearLink.style.display = 'none';
659
+ runBtn.disabled = true;
660
+ resetOutput();
661
+ });
662
+
663
+ // ── Output state ─────────────────────────────────────────────────────────
664
/**
 * Return the output column to its initial state: placeholder visible,
 * loading / error / result panels hidden.
 */
function resetOutput() {
  outPlaceholder.style.display = '';
  for (const panel of [outLoading, outError, outResult]) {
    panel.classList.remove('active');
  }
}
670
+
671
/**
 * Switch the output column to the loading state.
 *
 * @param {string} [msg] - Label under the progress bar; falls back to
 *   "Running GLM-OCR…" when empty or omitted.
 */
function showLoading(msg) {
  const label = msg || 'Running GLM-OCR…';

  outPlaceholder.style.display = 'none';
  loadingLabel.textContent = label;
  outLoading.classList.add('active');
  [outError, outResult].forEach((panel) => {
    panel.classList.remove('active');
  });
}
678
+
679
/**
 * Switch the output column to the error state and display `msg`.
 *
 * Hides every other panel itself (placeholder, loading, result) so the error
 * is never shown next to stale content, e.g. when the output was reset while
 * a request was still in flight.
 *
 * @param {string} msg - Human-readable error text.
 */
function showError(msg) {
  outPlaceholder.style.display = 'none';
  outLoading.classList.remove('active');
  outResult.classList.remove('active');
  outError.classList.add('active');
  outError.textContent = `⚠ ${msg}`;
}
684
+
685
/**
 * Switch the output column to the result state and render an OCR response.
 *
 * The metadata chips are built as DOM nodes with `textContent` rather than
 * an HTML string, so values coming from the server are never parsed as
 * markup. Placeholder, loading and error panels are hidden here so the
 * result never appears next to stale content. Finishes by refreshing the
 * metrics counters.
 *
 * @param {object} data - Response body of POST /ocr (`word_count`,
 *   `char_count`, `latency_ms`, `device`, `mode`, `text`).
 */
function showResult(data) {
  outPlaceholder.style.display = 'none';
  outLoading.classList.remove('active');
  outError.classList.remove('active');
  outResult.classList.add('active');

  const chips = [
    ['words', data.word_count],
    ['chars', data.char_count],
    ['latency', `${(data.latency_ms / 1000).toFixed(2)}s`],
    ['device', data.device],
    ['mode', data.mode],
  ].map(([label, value]) => {
    const chip = document.createElement('span');
    chip.className = 'chip';
    const strong = document.createElement('strong');
    strong.textContent = String(value);
    chip.append(`${label}: `, strong);
    return chip;
  });
  resultMeta.replaceChildren(...chips);

  resultContent.textContent = data.text || '[No text detected]';
  pollMetrics();
}
698
+
699
+ // ── Loading messages ──────────────────────────────────────────────────────
700
// Status lines cycled under the progress bar while a request is running.
const msgs = ['Running GLM-OCR…', 'Encoding image…', 'Decoding tokens…', 'Assembling output…'];
// Handle of the interval that rotates the status line (null until first run).
let msgTimer = null;

/**
 * Enter the loading state and rotate through `msgs` every 2 seconds,
 * starting with the first message.
 */
function startLoadingAnim() {
  let step = 0;
  const advance = () => {
    step = (step + 1) % msgs.length;
    loadingLabel.textContent = msgs[step];
  };

  showLoading(msgs[0]);
  msgTimer = setInterval(advance, 2000);
}

/** Stop rotating the loading status line. */
function stopLoadingAnim() {
  clearInterval(msgTimer);
}
710
+
711
+ // ── Run ──────────────────────────────────────────────────────────────────
712
// Upload the selected image to POST /ocr and show the result or the error.
runBtn.addEventListener('click', async () => {
  if (!imageFile) return;
  runBtn.disabled = true;
  startLoadingAnim();

  const form = new FormData();
  form.append('file', imageFile);
  form.append('mode', selectedMode);

  try {
    const r = await fetch(`${API}/ocr`, { method: 'POST', body: form });
    // An error response is not guaranteed to carry a JSON body; do not let a
    // parse failure mask the HTTP status.
    const data = await r.json().catch(() => null);

    if (!r.ok) {
      // `detail` is a string for the server's own errors, but may be a
      // structured value (e.g. a list of validation errors).
      const detail = data?.detail;
      const text = typeof detail === 'string' ? detail : detail && JSON.stringify(detail);
      throw new Error(text || `Error ${r.status}`);
    }
    if (!data) throw new Error('Server returned an invalid response.');

    showResult(data);
  } catch (err) {
    showError(err.message);
  } finally {
    stopLoadingAnim();
    // Stay disabled if the image was cleared while the request was running.
    runBtn.disabled = !imageFile;
  }
});
733
+
734
+ // ── Copy ─────────────────────────────────────────────────────────────────
735
+ document.getElementById('copy-btn').addEventListener('click', async () => {
736
+ try { await navigator.clipboard.writeText(resultContent.textContent); toast('Copied!'); }
737
+ catch { toast('Select text manually.'); }
738
+ });
739
+
740
+ // ── Download ─────────────────────────────────────────────────────────────
741
+ document.getElementById('dl-btn').addEventListener('click', () => {
742
+ const blob = new Blob([resultContent.textContent], { type: 'text/plain' });
743
+ const a = document.createElement('a');
744
+ a.href = URL.createObjectURL(blob);
745
+ a.download = `glm-ocr-${Date.now()}.txt`;
746
+ a.click();
747
+ URL.revokeObjectURL(a.href);
748
+ });
749
+
750
+ // ── Toast ─────────────────────────────────────────────────────────────────
751
/**
 * Show `msg` in the page's toast element for 2.2 seconds.
 *
 * The pending hide timer (kept on `toast.hideTimer`) is cancelled first, so
 * a toast shown shortly after another one still gets its full display time
 * instead of being hidden by the earlier toast's timer.
 *
 * @param {string} msg - Text to display.
 */
function toast(msg) {
  const t = document.getElementById('toast');
  t.textContent = msg;
  t.classList.add('show');

  clearTimeout(toast.hideTimer);
  toast.hideTimer = setTimeout(() => t.classList.remove('show'), 2200);
}
757
+ </script>
758
+ </body>
759
+ </html>
generate_icons.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ generate_icons.py — Run this once to create the extension icons.
3
+ Requires Pillow: pip install Pillow
4
+ """
5
+ from PIL import Image, ImageDraw, ImageFont
6
+ import os
7
+
8
+ os.makedirs("icons", exist_ok=True)
9
+
10
def make_icon(size):
    """Draw a ``size`` x ``size`` icon and save it as ``icons/icon{size}.png``.

    The icon is a transparent RGBA canvas with an orange rounded square and a
    centred white "G". Arial is used when available; otherwise Pillow's
    built-in default font is used.

    Args:
        size: Edge length of the icon in pixels.
    """
    img = Image.new("RGBA", (size, size), (0, 0, 0, 0))
    draw = ImageDraw.Draw(img)

    # Background rounded rect
    pad = size // 8
    draw.rounded_rectangle(
        [pad, pad, size - pad, size - pad],
        radius=size // 5,
        fill="#c94a1f"
    )

    # Letter "G" for GLM
    font_size = int(size * 0.52)
    try:
        font = ImageFont.truetype("arial.ttf", font_size)
    except (OSError, ImportError):
        # Font file missing/unreadable, or Pillow built without FreeType.
        # Catch only these so Ctrl-C and real bugs are not swallowed.
        font = ImageFont.load_default()

    text = "G"
    # textbbox may start at a non-zero offset; subtract it so the glyph's
    # visible box (not its origin) ends up centred.
    bbox = draw.textbbox((0, 0), text, font=font)
    tw = bbox[2] - bbox[0]
    th = bbox[3] - bbox[1]
    tx = (size - tw) // 2 - bbox[0]
    ty = (size - th) // 2 - bbox[1]
    draw.text((tx, ty), text, fill="white", font=font)

    img.save(f"icons/icon{size}.png")
    print(f"Created icons/icon{size}.png")
39
+
40
+ for s in [16, 48, 128]:
41
+ make_icon(s)
42
+
43
+ print("Done. Icons created in icons/")
main.py ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ main.py — FastAPI server for zai-org/GLM-OCR
3
+
4
+ Endpoints:
5
+ GET / → Serves the frontend HTML
6
+ GET /health → Liveness probe + model info
7
+ POST /ocr → Run OCR on uploaded image
8
+ GET /metrics → Session-level stats
9
+ """
10
+
11
+ import logging
12
+ import time
13
+ from contextlib import asynccontextmanager
14
+ from pathlib import Path
15
+
16
+ import uvicorn
17
+ from fastapi import FastAPI, File, Form, HTTPException, UploadFile, Request
18
+ from fastapi.middleware.cors import CORSMiddleware
19
+ from fastapi.responses import FileResponse, JSONResponse
20
+ from pydantic import BaseModel
21
+ from typing import Annotated
22
+
23
+ from ocr_engine import engine, OcrResult, OcrMode
24
+
25
+ # ── Logging ─────────────────────────────────────────────────────────────────
26
+
27
# Root logging setup: timestamp | level | logger name — message.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)-8s | %(name)s — %(message)s",
    datefmt="%H:%M:%S",
)
logger = logging.getLogger(__name__)
33
+
34
+ # ── Session metrics ─────────────────────────────────────────────────────────
35
+
36
class SessionMetrics:
    """In-memory counters for the lifetime of this server process."""

    def __init__(self):
        # Successful OCR calls and the totals accumulated over them.
        self.total_requests = 0
        self.total_words = 0
        self.total_chars = 0
        self.total_ms = 0.0
        # Failed OCR calls; incremented directly by the route handler.
        self.errors = 0
        # Wall-clock start, used to report uptime.
        self.started_at = time.time()

    def record(self, result: OcrResult):
        """Add one successful OCR result to the running totals."""
        self.total_requests += 1
        self.total_words += result.word_count
        self.total_chars += result.char_count
        self.total_ms += result.latency_ms

    def to_dict(self) -> dict:
        """Return the counters as a JSON-serialisable snapshot."""
        if self.total_requests:
            mean_latency = self.total_ms / self.total_requests
        else:
            # No requests yet: report 0 instead of dividing by zero.
            mean_latency = 0
        uptime = time.time() - self.started_at

        snapshot = {}
        snapshot["total_requests"] = self.total_requests
        snapshot["total_words_extracted"] = self.total_words
        snapshot["total_chars_extracted"] = self.total_chars
        snapshot["avg_latency_ms"] = round(mean_latency, 1)
        snapshot["error_count"] = self.errors
        snapshot["uptime_seconds"] = round(uptime, 1)
        return snapshot
61
+
62
+ metrics = SessionMetrics()
63
+
64
+ # ── Lifespan ─────────────────────────────────────────────────────────────────
65
+
66
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load the OCR model before the app starts serving and unload it on exit.

    The unload runs in a ``finally`` block so the model is released even when
    the application context exits with an exception.
    """
    logger.info("🚀 Starting up — loading GLM-OCR model …")
    engine.load()
    logger.info("✅ Model ready.")
    try:
        yield
    finally:
        logger.info("🛑 Shutting down …")
        engine.unload()
74
+
75
+ # ── App ──────────────────────────────────────────────────────────────────────
76
+
77
+ app = FastAPI(
78
+ title="GLM-OCR API",
79
+ description="Self-hosted OCR backend powered by zai-org/GLM-OCR",
80
+ version="1.0.0",
81
+ lifespan=lifespan,
82
+ )
83
+
84
+ app.add_middleware(
85
+ CORSMiddleware,
86
+ allow_origins=["*"],
87
+ allow_methods=["GET", "POST"],
88
+ allow_headers=["*"],
89
+ )
90
+
91
+ # ── Schemas ───────────────────────────────────────────────────────────────────
92
+
93
+ class OcrResponse(BaseModel):
94
+ success: bool
95
+ text: str
96
+ word_count: int
97
+ char_count: int
98
+ latency_ms: float
99
+ mode: str
100
+ model_id: str
101
+ device: str
102
+
103
+ # ── Routes ────────────────────────────────────────────────────────────────────
104
+
105
@app.get("/", include_in_schema=False)
async def serve_frontend():
    """Serve ``frontend/index.html`` from next to this file, or a 404 JSON body."""
    index_path = Path(__file__).parent / "frontend" / "index.html"
    if index_path.exists():
        return FileResponse(str(index_path))
    return JSONResponse({"message": "Frontend not found."}, 404)
111
+
112
+
113
@app.get("/health")
async def health():
    """Report whether the model is loaded, together with the engine's info."""
    status = "ok" if engine.loaded else "loading"
    return {"status": status, "model": engine.info}
119
+
120
+
121
@app.post("/ocr", response_model=OcrResponse)
async def run_ocr(
    file: Annotated[UploadFile, File(description="Image file (PNG, JPG, WEBP, BMP, TIFF)")],
    mode: Annotated[OcrMode, Form(description="'recognize' for plain text · 'parse' for structured markdown")] = "recognize",
):
    """
    Run GLM-OCR on an uploaded image.

    **mode options:**
    - `recognize` — extracts raw text, preserves layout (default)
    - `parse` — returns structured markdown (headers, tables, lists)
    """
    # Responses: 415 unsupported content type, 400 empty upload, 413 over the
    # size limit, 422 when the engine raises ValueError, 500 for any other
    # inference error. Only the last two are counted in `metrics.errors`.
    max_bytes = 20 * 1024 * 1024

    allowed = {"image/png", "image/jpeg", "image/webp", "image/gif", "image/bmp", "image/tiff"}
    if file.content_type and file.content_type not in allowed:
        raise HTTPException(status_code=415, detail=f"Unsupported file type: {file.content_type}")

    # Read at most one byte past the limit: enough to detect an oversized
    # upload without pulling all of it into memory first.
    image_bytes = await file.read(max_bytes + 1)
    if not image_bytes:
        raise HTTPException(status_code=400, detail="Empty file.")
    if len(image_bytes) > max_bytes:
        raise HTTPException(status_code=413, detail="File too large. Max 20 MB.")

    logger.info(f"OCR | file={file.filename} size={len(image_bytes)/1024:.1f}KB mode={mode}")

    try:
        result = engine.run(image_bytes, mode=mode)
    except ValueError as e:
        metrics.errors += 1
        raise HTTPException(status_code=422, detail=str(e)) from e
    except Exception as e:
        metrics.errors += 1
        logger.exception("Inference error")
        raise HTTPException(status_code=500, detail=f"Inference failed: {e}") from e

    metrics.record(result)
    logger.info(f"Done | {result.word_count} words | {result.latency_ms:.0f}ms")

    return OcrResponse(
        success=True,
        text=result.text,
        word_count=result.word_count,
        char_count=result.char_count,
        latency_ms=result.latency_ms,
        mode=result.mode,
        model_id=result.model_id,
        device=result.device,
    )
168
+
169
+
170
@app.get("/metrics")
async def get_metrics():
    """Return the current session counters as JSON."""
    snapshot = metrics.to_dict()
    return snapshot
173
+
174
+
175
@app.exception_handler(Exception)
async def global_handler(request: Request, exc: Exception):
    """Log an otherwise unhandled exception and answer with a generic 500 body."""
    logger.exception(f"Unhandled: {request.url}")
    body = {"detail": "Internal server error"}
    return JSONResponse(content=body, status_code=500)
179
+
180
+
181
+ if __name__ == "__main__":
182
+ uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=False)
manifest.json ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "manifest_version": 3,
3
+ "name": "GLM-OCR — Text from Screen",
4
+ "version": "1.0.0",
5
+ "description": "Select any region on screen and extract text using the self-hosted GLM-OCR model.",
6
+
7
+ "permissions": [
8
+ "activeTab",
9
+ "scripting",
10
+ "tabs",
11
+ "storage"
12
+ ],
13
+
14
+ "host_permissions": [
15
+ "http://localhost:8000/*",
16
+ "<all_urls>"
17
+ ],
18
+
19
+ "background": {
20
+ "service_worker": "background.js"
21
+ },
22
+
23
+ "action": {
24
+ "default_popup": "popup.html",
25
+ "default_icon": {
26
+ "16": "icons/icon16.png",
27
+ "48": "icons/icon48.png",
28
+ "128": "icons/icon128.png"
29
+ }
30
+ },
31
+
32
+ "content_scripts": [
33
+ {
34
+ "matches": ["<all_urls>"],
35
+ "js": ["content.js"],
36
+ "css": ["content.css"],
37
+ "run_at": "document_idle",
38
+ "all_frames": false
39
+ }
40
+ ],
41
+
42
+ "icons": {
43
+ "16": "icons/icon16.png",
44
+ "48": "icons/icon48.png",
45
+ "128": "icons/icon128.png"
46
+ },
47
+
48
+ "web_accessible_resources": [
49
+ {
50
+ "resources": ["sidebar.html"],
51
+ "matches": ["<all_urls>"]
52
+ }
53
+ ]
54
+ }
ocr_engine.py ADDED
@@ -0,0 +1,242 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ocr_engine.py — zai-org/GLM-OCR inference module
3
+
4
+ GLM-OCR is a 0.9B multimodal OCR model built on the GLM-V encoder-decoder
5
+ architecture. It uses a CogViT visual encoder + GLM-0.5B language decoder,
6
+ trained with Multi-Token Prediction loss for high-quality document OCR.
7
+
8
+ Model: https://huggingface.co/zai-org/GLM-OCR
9
+ Paper: https://arxiv.org/abs/2603.10910
10
+ """
11
+
12
+ import io
13
+ import time
14
+ import logging
15
+ import tempfile
16
+ from dataclasses import dataclass
17
+ from pathlib import Path
18
+ from typing import Literal
19
+
20
+ import torch
21
+ import torch.nn.functional as F
22
+ from PIL import Image
23
+ from transformers import AutoProcessor, AutoModelForImageTextToText
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+ # ── Config ─────────────────────────────────────────────────────────────────
28
+
29
+ MODEL_ID = "zai-org/GLM-OCR"
30
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
31
+
32
+ # Two prompt modes supported by GLM-OCR:
33
+ # "recognize" → "Text Recognition:" (extract raw text, preserves structure)
34
+ # "parse" → "Document Parsing:" (structured markdown output)
35
+ OcrMode = Literal["recognize", "parse"]
36
+
37
+ PROMPTS = {
38
+ "recognize": "Text Recognition:",
39
+ "parse": "Document Parsing:",
40
+ }
41
+
42
+ # ── Result dataclass ────────────────────────────────────────────────────────
43
+
44
@dataclass
class OcrResult:
    """Plain record describing the outcome of one OCR inference call."""

    # Extracted text.
    text: str
    # Prompt mode the request was run with.
    mode: str
    # Size statistics for ``text``.
    word_count: int
    char_count: int
    # Inference time in milliseconds.
    latency_ms: float
    # Device the model ran on, and the model identifier.
    device: str
    model_id: str
53
+
54
+ # ── Engine ──────────────────────────────────────────────────────────────────
55
+
56
class GlmOcrEngine:
    """
    Wraps the zai-org/GLM-OCR model and its processor.

    Call .load() once at startup, then .run(image_bytes, mode) per request.
    """

    def __init__(self):
        self.model = None
        self.processor = None
        self.loaded = False

    # -- Lifecycle -----------------------------------------------------------

    def load(self) -> None:
        """Load the processor and model for MODEL_ID; a no-op when already loaded."""
        if self.loaded:
            return

        logger.info(f"Loading {MODEL_ID} on {DEVICE} …")
        t0 = time.time()

        self.processor = AutoProcessor.from_pretrained(
            MODEL_ID,
            trust_remote_code=True,
        )

        self.model = AutoModelForImageTextToText.from_pretrained(
            MODEL_ID,
            torch_dtype="auto",      # let transformers choose the dtype
            device_map="auto",       # let transformers/accelerate place the weights
            trust_remote_code=True,
        )

        # CPU patch: replace the Conv3d patch_embed with a single matmul.
        # According to the discussion linked below, the default Conv3d path is
        # extremely slow on CPU and an F.linear call is dramatically faster.
        # See: https://huggingface.co/zai-org/GLM-OCR/discussions/36
        if DEVICE == "cpu":
            self._apply_cpu_patch()

        self.model.eval()
        self.loaded = True
        logger.info(f"Model loaded in {time.time() - t0:.1f}s")

    def _apply_cpu_patch(self):
        """
        Replace ``visual.patch_embed.forward`` with an F.linear-based version.

        Best-effort: any failure (for example an unexpected module layout) is
        logged as a warning and the model keeps its original forward.
        """
        try:
            base_model = self.model.model if hasattr(self.model, 'model') else self.model
            patch_embed = base_model.visual.patch_embed
            proj = patch_embed.proj

            # Number of input values per patch once flattened.
            in_features = (
                patch_embed.in_channels *
                patch_embed.temporal_patch_size *
                patch_embed.patch_size ** 2
            )
            embed_dim = patch_embed.embed_dim
            weight = proj.weight
            bias = proj.bias

            def _fast_forward(hidden_states: torch.Tensor) -> torch.Tensor:
                # Flatten every patch to one row and apply the projection
                # weights as a plain (embed_dim, in_features) matrix.
                target_dtype = weight.dtype
                hidden_states = hidden_states.reshape(-1, in_features).to(dtype=target_dtype)
                return F.linear(hidden_states, weight.reshape(embed_dim, -1), bias)

            patch_embed.forward = _fast_forward
            logger.info("CPU matmul patch applied to patch_embed.")
        except Exception as e:
            logger.warning(f"Could not apply CPU patch (will still work, just slower): {e}")

    def unload(self) -> None:
        """Drop the model and processor references and release cached CUDA memory."""
        # Compare against None explicitly instead of relying on the truthiness
        # of the model object.
        if self.model is not None or self.processor is not None:
            self.model = None
            self.processor = None
            self.loaded = False
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            logger.info("Model unloaded.")

    # -- Inference -----------------------------------------------------------

    def run(self, image_bytes: bytes, mode: OcrMode = "recognize") -> OcrResult:
        """
        Run GLM-OCR on raw image bytes.

        Args:
            image_bytes: Raw bytes of the uploaded image.
            mode:
                'recognize' -> plain text extraction ("Text Recognition:")
                'parse'     -> structured markdown output ("Document Parsing:")

        Returns:
            OcrResult with extracted text and metadata.

        Raises:
            RuntimeError: if .load() has not been called.
            ValueError: if ``mode`` is not a key of PROMPTS or the bytes are
                not a decodable image.
        """
        if not self.loaded:
            raise RuntimeError("Engine not loaded. Call .load() first.")

        # Reject unknown modes up front, before any temp file exists, and as a
        # ValueError so callers can treat it like any other bad input.
        if mode not in PROMPTS:
            raise ValueError(
                f"Unsupported mode {mode!r}. Expected one of: {', '.join(PROMPTS)}."
            )
        prompt_text = PROMPTS[mode]

        # Validate image
        img = self._validate_image(image_bytes)

        tmp_path = None
        try:
            # The image is handed to the processor as a file path, so write it
            # to a temp file. Creation happens inside the try so the file is
            # removed even when saving or inference fails.
            with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
                tmp_path = Path(tmp.name)
                img.save(tmp, format="PNG")

            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "image", "url": str(tmp_path)},
                        {"type": "text", "text": prompt_text},
                    ],
                }
            ]

            t0 = time.time()
            inputs = self.processor.apply_chat_template(
                messages,
                tokenize=True,
                add_generation_prompt=True,
                return_dict=True,
                return_tensors="pt",
            ).to(self.model.device)

            # token_type_ids not used by this model
            inputs.pop("token_type_ids", None)

            with torch.inference_mode():
                generated_ids = self.model.generate(
                    **inputs,
                    max_new_tokens=8192,
                )

            # Decode only the newly generated tokens
            output_text = self.processor.decode(
                generated_ids[0][inputs["input_ids"].shape[1]:],
                skip_special_tokens=False,
            )
        finally:
            if tmp_path is not None:
                tmp_path.unlink(missing_ok=True)

        latency_ms = (time.time() - t0) * 1000
        text = output_text.strip() if output_text else ""

        return OcrResult(
            text=text,
            mode=mode,
            word_count=len(text.split()) if text else 0,
            char_count=len(text),
            latency_ms=round(latency_ms, 1),
            device=str(next(self.model.parameters()).device),
            model_id=MODEL_ID,
        )

    # -- Helpers -------------------------------------------------------------

    @staticmethod
    def _validate_image(image_bytes: bytes) -> Image.Image:
        """Decode ``image_bytes`` into an RGB PIL image, raising ValueError when invalid."""
        try:
            img = Image.open(io.BytesIO(image_bytes))
            img.verify()
            # The image is opened a second time after verify() before converting.
            img = Image.open(io.BytesIO(image_bytes))
            return img.convert("RGB")
        except Exception as e:
            raise ValueError(f"Invalid image: {e}") from e

    @property
    def info(self) -> dict:
        """Static description of the engine: model id, device and GPU details when CUDA is available."""
        return {
            "model_id": MODEL_ID,
            "device": DEVICE,
            "loaded": self.loaded,
            "cuda_available": torch.cuda.is_available(),
            "gpu_name": torch.cuda.get_device_name(0) if torch.cuda.is_available() else None,
            "gpu_memory_gb": round(
                torch.cuda.get_device_properties(0).total_memory / 1e9, 1
            ) if torch.cuda.is_available() else None,
        }
239
+
240
+
241
+ # ── Singleton ───────────────────────────────────────────────────────────────
242
+ engine = GlmOcrEngine()
popup.html ADDED
@@ -0,0 +1,302 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8"/>
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0"/>
6
+ <title>GLM-OCR</title>
7
+ <link href="https://fonts.googleapis.com/css2?family=IBM+Plex+Mono:wght@400;700&family=DM+Serif+Display:ital@0;1&family=DM+Sans:wght@400;500&display=swap" rel="stylesheet"/>
8
+ <style>
9
+ *, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }
10
+ :root {
11
+ --ink: #0f0e0d;
12
+ --paper: #f5f0e8;
13
+ --warm: #ede8dc;
14
+ --border: #d4cfc3;
15
+ --muted: #8f8880;
16
+ --accent: #c94a1f;
17
+ --green: #1a6b4a;
18
+ --mono: 'IBM Plex Mono', monospace;
19
+ --serif: 'DM Serif Display', serif;
20
+ --sans: 'DM Sans', sans-serif;
21
+ }
22
+
23
+ body {
24
+ width: 300px;
25
+ background: var(--paper);
26
+ color: var(--ink);
27
+ font-family: var(--sans);
28
+ }
29
+
30
+ body::before {
31
+ content: '';
32
+ position: fixed; inset: 0;
33
+ background-image: radial-gradient(circle, rgba(0,0,0,0.05) 1px, transparent 1px);
34
+ background-size: 16px 16px;
35
+ pointer-events: none;
36
+ }
37
+
38
+ .inner { position: relative; }
39
+
40
+ /* Header */
41
+ .header {
42
+ padding: 16px 18px 14px;
43
+ border-bottom: 2px solid var(--ink);
44
+ display: flex;
45
+ align-items: center;
46
+ justify-content: space-between;
47
+ }
48
+
49
+ .logo {
50
+ font-family: var(--serif);
51
+ font-size: 1.1rem;
52
+ letter-spacing: -0.01em;
53
+ }
54
+
55
+ .logo em { font-style: italic; color: var(--accent); }
56
+
57
+ .server-badge {
58
+ display: flex;
59
+ align-items: center;
60
+ gap: 5px;
61
+ font-family: var(--mono);
62
+ font-size: 0.58rem;
63
+ color: var(--muted);
64
+ letter-spacing: 0.06em;
65
+ }
66
+
67
+ .dot {
68
+ width: 6px; height: 6px;
69
+ border-radius: 50%;
70
+ background: var(--muted);
71
+ }
72
+ .dot.ok { background: var(--green); }
73
+ .dot.err { background: var(--accent); }
74
+ .dot.pulse { animation: blink 1.2s ease-in-out infinite; }
75
+ @keyframes blink { 0%,100%{opacity:1} 50%{opacity:0.3} }
76
+
77
+ /* Main CTA */
78
+ .cta-area {
79
+ padding: 20px 18px;
80
+ border-bottom: 1px solid var(--border);
81
+ }
82
+
83
+ .cta-label {
84
+ font-family: var(--mono);
85
+ font-size: 0.62rem;
86
+ color: var(--muted);
87
+ letter-spacing: 0.1em;
88
+ text-transform: uppercase;
89
+ margin-bottom: 10px;
90
+ }
91
+
92
+ .select-btn {
93
+ width: 100%;
94
+ padding: 14px;
95
+ background: var(--accent);
96
+ color: white;
97
+ border: none;
98
+ border-radius: 2px;
99
+ font-family: var(--serif);
100
+ font-size: 1rem;
101
+ cursor: pointer;
102
+ transition: background 0.15s;
103
+ display: flex;
104
+ align-items: center;
105
+ justify-content: center;
106
+ gap: 8px;
107
+ }
108
+
109
+ .select-btn:hover:not(:disabled) { background: #b53d15; }
110
+ .select-btn:disabled { opacity: 0.35; cursor: not-allowed; }
111
+
112
+ .select-btn .shortcut {
113
+ font-family: var(--mono);
114
+ font-size: 0.6rem;
115
+ opacity: 0.7;
116
+ margin-left: auto;
117
+ }
118
+
119
+ .offline-msg {
120
+ display: none;
121
+ margin-top: 10px;
122
+ font-family: var(--mono);
123
+ font-size: 0.65rem;
124
+ color: var(--accent);
125
+ line-height: 1.6;
126
+ }
127
+
128
+ .offline-msg.show { display: block; }
129
+
130
+ .offline-msg a {
131
+ color: var(--accent);
132
+ text-decoration: underline;
133
+ }
134
+
135
+ /* How it works */
136
+ .how {
137
+ padding: 16px 18px;
138
+ border-bottom: 1px solid var(--border);
139
+ }
140
+
141
+ .how-title {
142
+ font-family: var(--mono);
143
+ font-size: 0.6rem;
144
+ color: var(--muted);
145
+ letter-spacing: 0.1em;
146
+ text-transform: uppercase;
147
+ margin-bottom: 12px;
148
+ }
149
+
150
+ .step {
151
+ display: flex;
152
+ gap: 10px;
153
+ align-items: flex-start;
154
+ margin-bottom: 8px;
155
+ }
156
+
157
+ .step:last-child { margin-bottom: 0; }
158
+
159
+ .step-num {
160
+ font-family: var(--mono);
161
+ font-size: 0.6rem;
162
+ color: var(--accent);
163
+ font-weight: 700;
164
+ flex-shrink: 0;
165
+ margin-top: 2px;
166
+ }
167
+
168
+ .step-text {
169
+ font-size: 0.78rem;
170
+ line-height: 1.5;
171
+ color: var(--ink);
172
+ }
173
+
174
+ /* Settings */
175
+ .settings {
176
+ padding: 14px 18px;
177
+ }
178
+
179
+ .settings-title {
180
+ font-family: var(--mono);
181
+ font-size: 0.6rem;
182
+ color: var(--muted);
183
+ letter-spacing: 0.1em;
184
+ text-transform: uppercase;
185
+ margin-bottom: 10px;
186
+ }
187
+
188
+ .setting-row {
189
+ display: flex;
190
+ align-items: center;
191
+ justify-content: space-between;
192
+ margin-bottom: 8px;
193
+ }
194
+
195
+ .setting-label {
196
+ font-family: var(--mono);
197
+ font-size: 0.68rem;
198
+ color: var(--ink);
199
+ }
200
+
201
+ .mode-toggle {
202
+ display: flex;
203
+ gap: 4px;
204
+ }
205
+
206
+ .mode-opt {
207
+ font-family: var(--mono);
208
+ font-size: 0.58rem;
209
+ padding: 4px 8px;
210
+ border: 1px solid var(--border);
211
+ background: transparent;
212
+ color: var(--muted);
213
+ cursor: pointer;
214
+ border-radius: 2px;
215
+ transition: all 0.12s;
216
+ }
217
+
218
+ .mode-opt.active {
219
+ background: var(--ink);
220
+ border-color: var(--ink);
221
+ color: var(--paper);
222
+ }
223
+
224
+ /* Footer */
225
+ .footer {
226
+ padding: 10px 18px;
227
+ border-top: 1px solid var(--border);
228
+ font-family: var(--mono);
229
+ font-size: 0.58rem;
230
+ color: var(--muted);
231
+ display: flex;
232
+ justify-content: space-between;
233
+ }
234
+ </style>
235
+ </head>
236
+ <body>
237
+ <div class="inner">
238
+
239
+ <!-- Header -->
240
+ <div class="header">
241
+ <div class="logo">GLM-<em>OCR</em></div>
242
+ <div class="server-badge">
243
+ <div class="dot pulse" id="dot"></div>
244
+ <span id="server-label">checking…</span>
245
+ </div>
246
+ </div>
247
+
248
+ <!-- CTA -->
249
+ <div class="cta-area">
250
+ <div class="cta-label">Select region on screen</div>
251
+ <button class="select-btn" id="select-btn" disabled>
252
+ ✂ &nbsp;Select &amp; Extract Text
253
+ </button>
254
+ <div class="offline-msg" id="offline-msg">
255
+ ⚠ GLM-OCR server not running.<br>
256
+ Start it with <code>python main.py</code> at <a href="http://localhost:8000" target="_blank">localhost:8000</a>.
257
+ </div>
258
+ </div>
259
+
260
+ <!-- How it works -->
261
+ <div class="how">
262
+ <div class="how-title">How it works</div>
263
+ <div class="step">
264
+ <div class="step-num">01</div>
265
+ <div class="step-text">Click the button above — page dims</div>
266
+ </div>
267
+ <div class="step">
268
+ <div class="step-num">02</div>
269
+ <div class="step-text">Drag a box around the text you want</div>
270
+ </div>
271
+ <div class="step">
272
+ <div class="step-num">03</div>
273
+ <div class="step-text">GLM-OCR extracts text into a sidebar</div>
274
+ </div>
275
+ <div class="step">
276
+ <div class="step-num">04</div>
277
+ <div class="step-text">Copy or download the result</div>
278
+ </div>
279
+ </div>
280
+
281
+ <!-- Settings -->
282
+ <div class="settings">
283
+ <div class="settings-title">Settings</div>
284
+ <div class="setting-row">
285
+ <span class="setting-label">OCR Mode</span>
286
+ <div class="mode-toggle">
287
+ <button class="mode-opt active" data-mode="recognize">recognize</button>
288
+ <button class="mode-opt" data-mode="parse">parse</button>
289
+ </div>
290
+ </div>
291
+ </div>
292
+
293
+ <div class="footer">
294
+ <span>zai-org/GLM-OCR · 0.9B</span>
295
+ <span>self-hosted</span>
296
+ </div>
297
+
298
+ </div>
299
+
300
+ <script src="popup.js"></script>
301
+ </body>
302
+ </html>
popup.js ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // popup.js
2
+
3
+ const selectBtn = document.getElementById("select-btn");
4
+ const dot = document.getElementById("dot");
5
+ const serverLabel = document.getElementById("server-label");
6
+ const offlineMsg = document.getElementById("offline-msg");
7
+
8
+ let selectedMode = "recognize";
9
+
10
+ // ── Check server health ───────────────────────────────────────────────────────
11
/**
 * Probe the GLM-OCR backend's /health endpoint.
 *
 * Never throws: network errors, timeouts, non-2xx responses and malformed
 * JSON all count as "not available".
 *
 * @param {string} [baseUrl="http://localhost:8000"] - Backend origin (a trailing slash is tolerated).
 * @param {number} [timeoutMs=3000] - Abort the request after this many milliseconds.
 * @returns {Promise<boolean>} true only for a 2xx response whose JSON body has `status === "ok"`.
 */
async function checkServer(baseUrl = "http://localhost:8000", timeoutMs = 3000) {
  const endpoint = `${String(baseUrl).replace(/\/+$/, "")}/health`;
  try {
    const response = await fetch(endpoint, {
      signal: AbortSignal.timeout(timeoutMs),
    });
    // An error page should not be trusted even if it happens to parse as JSON.
    if (!response.ok) {
      return false;
    }
    const payload = await response.json();
    return payload?.status === "ok";
  } catch {
    return false;
  }
}
22
+
23
/**
 * Reflect the backend's availability in the popup: status dot, label,
 * select button and the offline hint.
 */
async function updateServerStatus() {
  const online = await checkServer();
  const offline = !online;

  dot.className = online ? "dot ok" : "dot err";
  serverLabel.textContent = online ? "server ready" : "offline";
  selectBtn.disabled = offline;
  offlineMsg.classList.toggle("show", offline);
}
30
+
31
+ updateServerStatus();
32
+
33
+ // ── Mode toggle ───────────────────────────────────────────────────────────────
34
// Clicking a mode button makes it the only active one, remembers the choice
// in memory and persists it to extension storage.
for (const button of document.querySelectorAll(".mode-opt")) {
  button.addEventListener("click", () => {
    for (const other of document.querySelectorAll(".mode-opt")) {
      other.classList.remove("active");
    }
    button.classList.add("active");
    selectedMode = button.dataset.mode;
    chrome.storage.local.set({ ocrMode: selectedMode });
  });
}

// Restore the previously saved mode, if any, and highlight its button.
chrome.storage.local.get(["ocrMode"], ({ ocrMode }) => {
  if (!ocrMode) {
    return;
  }
  selectedMode = ocrMode;
  for (const button of document.querySelectorAll(".mode-opt")) {
    button.classList.toggle("active", button.dataset.mode === ocrMode);
  }
});
52
+
53
+ // ── Select button ─────────────────────────────────────────────────────────────
54
selectBtn.addEventListener("click", async () => {
  try {
    // Save current mode to storage so background can read it
    await chrome.storage.local.set({ ocrMode: selectedMode });

    // Get current tab and inject the selection
    const [tab] = await chrome.tabs.query({ active: true, currentWindow: true });
    if (tab?.id == null) {
      throw new Error("No active tab to start a selection in.");
    }

    await chrome.scripting.executeScript({
      target: { tabId: tab.id },
      func: () => {
        window.postMessage({ type: "GLMOCR_START" }, "*");
      },
    });

    // Tell content script to start selection mode. Not awaited, so the popup
    // can close right away; the catch only prevents an unhandled rejection
    // when no content script is listening in the tab.
    chrome.tabs.sendMessage(tab.id, { type: "START_SELECTION" }).catch((err) => {
      console.warn("GLM-OCR: START_SELECTION was not delivered", err);
    });
  } catch (err) {
    // Script injection can be refused for the active tab. Keep the popup open
    // and show the failure instead of dying silently.
    console.error("GLM-OCR: could not start selection", err);
    dot.className = "dot err";
    serverLabel.textContent = "selection failed";
    return;
  }

  // Close popup so it doesn't obscure the page
  window.close();
});
requirements.txt ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # GLM-OCR Backend — Python dependencies
2
+ # Install: pip install -r requirements.txt
3
+
4
+ # Web framework
5
+ fastapi==0.115.5
6
+ uvicorn[standard]==0.32.1
7
+ python-multipart>=0.0.12
8
+
9
+ # GLM-OCR requires transformers >= 5.3.0
10
+ # Install latest directly from GitHub to be safe:
11
+ # pip install git+https://github.com/huggingface/transformers.git
12
+ transformers>=5.3.0
13
+
14
+ # ML
15
+ torch>=2.2.0
16
+ torchvision>=0.17.0
17
+ accelerate>=1.1.0
18
+
19
+ # Image
20
+ Pillow>=10.4.0
21
+
22
+ # Misc
23
+ pydantic>=2.9.0
24
+ safetensors>=0.4.5
25
+ einops>=0.8.0
26
+ sentencepiece>=0.2.0
27
+ tiktoken>=0.7.0
sidebar.html ADDED
@@ -0,0 +1,391 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8"/>
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0"/>
6
+ <title>GLM-OCR Result</title>
7
+ <link href="https://fonts.googleapis.com/css2?family=IBM+Plex+Mono:wght@400;500;700&family=DM+Serif+Display:ital@0;1&family=DM+Sans:wght@400;500&display=swap" rel="stylesheet"/>
8
+ <style>
9
+ *, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }
10
+ :root {
11
+ --ink: #0f0e0d;
12
+ --paper: #f5f0e8;
13
+ --warm: #ede8dc;
14
+ --border: #d4cfc3;
15
+ --muted: #8f8880;
16
+ --accent: #c94a1f;
17
+ --green: #1a6b4a;
18
+ --mono: 'IBM Plex Mono', monospace;
19
+ --serif: 'DM Serif Display', serif;
20
+ --sans: 'DM Sans', sans-serif;
21
+ }
22
+
23
+ html, body {
24
+ height: 100%;
25
+ background: var(--paper);
26
+ color: var(--ink);
27
+ font-family: var(--sans);
28
+ overflow: hidden;
29
+ }
30
+
31
+ body::before {
32
+ content: '';
33
+ position: fixed; inset: 0;
34
+ background-image: radial-gradient(circle, rgba(0,0,0,0.05) 1px, transparent 1px);
35
+ background-size: 16px 16px;
36
+ pointer-events: none;
37
+ }
38
+
39
+ .sidebar {
40
+ position: relative;
41
+ height: 100vh;
42
+ display: flex;
43
+ flex-direction: column;
44
+ }
45
+
46
+ /* ── Header ── */
47
+ .sb-header {
48
+ padding: 14px 16px;
49
+ border-bottom: 2px solid var(--ink);
50
+ display: flex;
51
+ align-items: center;
52
+ justify-content: space-between;
53
+ flex-shrink: 0;
54
+ }
55
+
56
+ .sb-title {
57
+ font-family: var(--serif);
58
+ font-size: 1rem;
59
+ letter-spacing: -0.01em;
60
+ }
61
+
62
+ .sb-title em { font-style: italic; color: var(--accent); }
63
+
64
+ .sb-close {
65
+ font-family: var(--mono);
66
+ font-size: 0.6rem;
67
+ padding: 5px 10px;
68
+ border: 1px solid var(--border);
69
+ background: transparent;
70
+ cursor: pointer;
71
+ border-radius: 2px;
72
+ color: var(--muted);
73
+ transition: all 0.12s;
74
+ }
75
+
76
+ .sb-close:hover { border-color: var(--ink); color: var(--ink); }
77
+
78
+ /* ── Scrollable body ── */
79
+ .sb-body {
80
+ flex: 1;
81
+ overflow-y: auto;
82
+ display: flex;
83
+ flex-direction: column;
84
+ }
85
+
86
+ /* ── Loading ── */
87
+ .sb-loading {
88
+ flex: 1;
89
+ display: flex;
90
+ flex-direction: column;
91
+ align-items: center;
92
+ justify-content: center;
93
+ gap: 16px;
94
+ padding: 24px;
95
+ }
96
+
97
+ .scan-bar-wrap { width: 140px; height: 3px; background: var(--border); border-radius: 2px; overflow: hidden; }
98
+ .scan-bar { height: 100%; background: var(--accent); border-radius: 2px; animation: scan 1.4s ease-in-out infinite; }
99
+ @keyframes scan { 0%{transform:translateX(-100%)} 50%{transform:translateX(0)} 100%{transform:translateX(100%)} }
100
+
101
+ .loading-label {
102
+ font-family: var(--mono);
103
+ font-size: 0.68rem;
104
+ color: var(--muted);
105
+ animation: blink 1.4s ease-in-out infinite;
106
+ }
107
+
108
+ @keyframes blink { 0%,100%{opacity:1} 50%{opacity:0.3} }
109
+
110
+ /* ── Error ── */
111
+ .sb-error {
112
+ margin: 16px;
113
+ background: #fff0f0;
114
+ border: 1px solid rgba(201,74,31,0.3);
115
+ border-radius: 2px;
116
+ padding: 14px;
117
+ font-family: var(--mono);
118
+ font-size: 0.72rem;
119
+ color: var(--accent);
120
+ line-height: 1.7;
121
+ }
122
+
123
+ /* ── Image preview ── */
124
+ .sb-image-wrap {
125
+ padding: 14px 16px 0;
126
+ flex-shrink: 0;
127
+ }
128
+
129
+ .sb-image-label {
130
+ font-family: var(--mono);
131
+ font-size: 0.58rem;
132
+ color: var(--muted);
133
+ letter-spacing: 0.1em;
134
+ text-transform: uppercase;
135
+ margin-bottom: 8px;
136
+ }
137
+
138
+ .sb-image {
139
+ width: 100%;
140
+ max-height: 160px;
141
+ object-fit: contain;
142
+ border: 1px solid var(--border);
143
+ border-radius: 2px;
144
+ background: var(--warm);
145
+ }
146
+
147
+ /* ── Meta chips ── */
148
+ .sb-meta {
149
+ padding: 10px 16px;
150
+ display: flex;
151
+ gap: 10px;
152
+ flex-wrap: wrap;
153
+ border-bottom: 1px solid var(--border);
154
+ flex-shrink: 0;
155
+ }
156
+
157
+ .chip {
158
+ font-family: var(--mono);
159
+ font-size: 0.6rem;
160
+ color: var(--muted);
161
+ }
162
+
163
+ .chip strong { color: var(--green); }
164
+
165
+ /* ── Extracted text ── */
166
+ .sb-text-section {
167
+ padding: 14px 16px;
168
+ display: flex;
169
+ flex-direction: column;
170
+ gap: 8px;
171
+ flex: 1;
172
+ min-height: 0;
173
+ }
174
+
175
+ .sb-text-label {
176
+ font-family: var(--mono);
177
+ font-size: 0.58rem;
178
+ color: var(--muted);
179
+ letter-spacing: 0.1em;
180
+ text-transform: uppercase;
181
+ flex-shrink: 0;
182
+ }
183
+
184
+ .sb-text {
185
+ background: var(--warm);
186
+ border: 1px solid var(--border);
187
+ border-radius: 2px;
188
+ padding: 14px;
189
+ font-family: var(--mono);
190
+ font-size: 0.78rem;
191
+ line-height: 1.85;
192
+ white-space: pre-wrap;
193
+ word-break: break-word;
194
+ overflow-y: auto;
195
+ flex: 1;
196
+ min-height: 120px;
197
+ }
198
+
199
+ /* ── Actions ── */
200
+ .sb-actions {
201
+ padding: 12px 16px;
202
+ border-top: 1px solid var(--border);
203
+ display: flex;
204
+ gap: 8px;
205
+ flex-shrink: 0;
206
+ }
207
+
208
+ .action-btn {
209
+ font-family: var(--mono);
210
+ font-size: 0.62rem;
211
+ letter-spacing: 0.04em;
212
+ padding: 9px 12px;
213
+ border: 1px solid var(--border);
214
+ background: transparent;
215
+ color: var(--ink);
216
+ cursor: pointer;
217
+ border-radius: 2px;
218
+ transition: all 0.12s;
219
+ flex: 1;
220
+ }
221
+
222
+ .action-btn:hover { border-color: var(--ink); }
223
+
224
+ .action-btn.primary {
225
+ background: var(--accent);
226
+ border-color: var(--accent);
227
+ color: white;
228
+ }
229
+
230
+ .action-btn.primary:hover { background: #b53d15; }
231
+
232
+ /* ── Toast ── */
233
+ .toast {
234
+ position: fixed;
235
+ bottom: 16px;
236
+ left: 50%;
237
+ transform: translateX(-50%) translateY(40px);
238
+ opacity: 0;
239
+ background: var(--ink);
240
+ color: var(--paper);
241
+ font-family: var(--mono);
242
+ font-size: 0.65rem;
243
+ padding: 8px 16px;
244
+ border-radius: 2px;
245
+ white-space: nowrap;
246
+ transition: all 0.3s cubic-bezier(0.34, 1.56, 0.64, 1);
247
+ z-index: 999;
248
+ }
249
+
250
+ .toast.show {
251
+ transform: translateX(-50%) translateY(0);
252
+ opacity: 1;
253
+ }
254
+ </style>
255
+ </head>
256
+ <body>
257
+ <div class="sidebar">
258
+
259
+ <!-- Header -->
260
+ <div class="sb-header">
261
+ <div class="sb-title">GLM-<em>OCR</em> Result</div>
262
+ <button class="sb-close" id="close-btn">✕ Close</button>
263
+ </div>
264
+
265
+ <!-- Body -->
266
+ <div class="sb-body" id="sb-body">
267
+
268
+ <!-- Loading state (default) -->
269
+ <div class="sb-loading" id="state-loading">
270
+ <div class="scan-bar-wrap"><div class="scan-bar"></div></div>
271
+ <div class="loading-label">Running GLM-OCR…</div>
272
+ </div>
273
+
274
+ </div>
275
+
276
+ <!-- Actions (shown after result) -->
277
+ <div class="sb-actions" id="sb-actions" style="display:none">
278
+ <button class="action-btn primary" id="new-btn">✂ New Selection</button>
279
+ <button class="action-btn" id="copy-btn">Copy</button>
280
+ <button class="action-btn" id="dl-btn">↓ .txt</button>
281
+ </div>
282
+
283
+ </div>
284
+
285
+ <div class="toast" id="toast"></div>
286
+
287
+ <script>
288
+ let extractedText = "";
289
+
290
+ // ── Receive data from content.js ──────────────────────────────────────────
291
// Receive data posted to this frame. Only SIDEBAR_DATA messages are handled.
// NOTE(review): the sender (event.origin / event.source) is not checked here,
// so the payload must be treated as untrusted by whatever renders it.
window.addEventListener("message", (event) => {
  const message = event.data;
  if (message?.type !== "SIDEBAR_DATA") {
    return;
  }

  const payload = message.data;
  // The loading placeholder is the page's default state; nothing to redraw.
  if (payload.loading) {
    return;
  }

  renderResult(payload);
});
299
+
300
/**
 * Render an OCR outcome (or an error) into the sidebar body.
 *
 * Every value taken from `data` is inserted as a text node or assigned as a
 * DOM property, never parsed as HTML, so markup inside the error message,
 * device name, counters or image URL cannot inject elements or attributes.
 *
 * @param {object} data - Payload of a SIDEBAR_DATA message.
 * @param {string} [data.error] - When truthy, only the error box is shown.
 * @param {string} [data.text] - Extracted text.
 * @param {string} [data.imageDataUrl] - URL used as the preview image source.
 * @param {number} [data.word_count]
 * @param {number} [data.char_count]
 * @param {number} [data.latency_ms]
 * @param {string} [data.device]
 */
function renderResult(data) {
  const body = document.getElementById("sb-body");
  const actions = document.getElementById("sb-actions");

  // Small element factory: string children become text nodes, not markup.
  const el = (tag, className, ...children) => {
    const node = document.createElement(tag);
    if (className) {
      node.className = className;
    }
    node.append(...children);
    return node;
  };
  const chip = (label, value) =>
    el("span", "chip", `${label}: `, el("strong", "", String(value)));

  if (data.error) {
    body.replaceChildren(
      el(
        "div",
        "sb-error",
        `\u26A0 ${data.error}`,
        el("br"),
        el("br"),
        "Make sure the GLM-OCR server is running at localhost:8000.",
      ),
    );
    actions.style.display = "flex";
    return;
  }

  // Kept in the script-level variable so the copy/download buttons can use it.
  extractedText = data.text || "";

  const latency = data.latency_ms ? `${(data.latency_ms / 1000).toFixed(2)}s` : "\u2014";

  const preview = el("img", "sb-image");
  preview.src = data.imageDataUrl || "";
  preview.alt = "Selection";

  let resultContent;
  if (data.text) {
    resultContent = String(data.text);
  } else {
    resultContent = el("span", "", "[No text detected]");
    resultContent.style.color = "var(--muted)";
  }
  const resultBox = el("div", "sb-text", resultContent);
  resultBox.id = "result-text";

  body.replaceChildren(
    el("div", "sb-image-wrap", el("div", "sb-image-label", "Selected Region"), preview),
    el(
      "div",
      "sb-meta",
      chip("words", data.word_count || 0),
      chip("chars", data.char_count || 0),
      chip("latency", latency),
      chip("device", data.device || "\u2014"),
    ),
    el("div", "sb-text-section", el("div", "sb-text-label", "Extracted Text"), resultBox),
  );

  actions.style.display = "flex";
}
335
+
336
/**
 * Escape a value for safe inclusion in HTML text or a quoted attribute.
 *
 * Besides `&`, `<` and `>`, both quote characters are escaped so the result
 * cannot break out of an attribute value. Non-string input is converted to a
 * string first; null and undefined become the empty string.
 *
 * @param {*} str - Value to escape.
 * @returns {string} The escaped text.
 */
function escapeHtml(str) {
  const entities = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&#39;",
  };
  return String(str ?? "").replace(/[&<>"']/g, (ch) => entities[ch]);
}
342
+
343
+ // ── Close ─────────────────────────────────────────────────────────────────
344
// Helpers shared by the button wiring below.
const onClick = (id, handler) => {
  document.getElementById(id).addEventListener("click", handler);
};
const notifyParent = (type) => {
  window.parent.postMessage({ type }, "*");
};

// Close: ask the embedding page to remove the sidebar.
onClick("close-btn", () => notifyParent("CLOSE_SIDEBAR"));

// New selection: ask the embedding page to start another region selection.
onClick("new-btn", () => notifyParent("START_NEW_SELECTION"));

// Copy: write the extracted text to the clipboard. If that fails, select the
// result box contents so the user can copy manually.
onClick("copy-btn", async () => {
  try {
    await navigator.clipboard.writeText(extractedText);
    toast("Copied!");
  } catch {
    const resultBox = document.getElementById("result-text");
    if (resultBox) {
      const range = document.createRange();
      range.selectNodeContents(resultBox);
      const selection = window.getSelection();
      selection.removeAllRanges();
      selection.addRange(range);
    }
    toast("Select text above and copy manually.");
  }
});

// Download: offer the extracted text as a timestamped .txt file.
onClick("dl-btn", () => {
  const blob = new Blob([extractedText], { type: "text/plain" });
  const link = document.createElement("a");
  link.href = URL.createObjectURL(blob);
  link.download = `glm-ocr-${Date.now()}.txt`;
  link.click();
  // NOTE(review): the object URL is revoked right after click(); confirm this
  // is reliable in every browser the extension targets.
  URL.revokeObjectURL(link.href);
});
381
+
382
+ // ── Toast ─────────────────────────────────────────────────────────────────
383
/**
 * Show a short-lived notification in the #toast element.
 *
 * The pending hide timer is cancelled before a new one is started, so a toast
 * shown while another is still visible stays up for its full two seconds
 * instead of being hidden by the earlier toast's timer.
 *
 * @param {string} msg - Text to display.
 */
function toast(msg) {
  const t = document.getElementById("toast");
  t.textContent = msg;
  t.classList.add("show");

  // The timer id is stored on the function itself to avoid a new global.
  clearTimeout(toast.hideTimer);
  toast.hideTimer = setTimeout(() => t.classList.remove("show"), 2000);
}
389
+ </script>
390
+ </body>
391
+ </html>