elvin.v.mammadov committed on
Commit
70085b1
·
1 Parent(s): 9ba2642

Added app.py

Browse files
Files changed (2) hide show
  1. .gitattributes +1 -0
  2. app.py +169 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.pdf filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import gradio as gr
import requests
import json
from jiwer import cer, wer
import re

# Path of the placeholder PDF uploaded to the endpoint under test.
pdf_file_path = 'dummy.pdf'

# Ground-truth page transcriptions, loaded once at import time.
# As consumed by send_request below, this is expected to be a list of
# {"page_number": int, "MD_text": str} dicts — one entry per PDF page.
with open("page_transcriptions.json", encoding="utf-8") as f:
    data = json.load(f)
def _normalize(text):
    """Normalize text for metric computation.

    Strips, lower-cases, flattens newlines to spaces, and collapses runs of
    whitespace to a single space — so CER/WER compare content, not layout.
    """
    text = text.strip().lower().replace("\n", " ")
    return re.sub(r'\s+', ' ', text)


def send_request(url):
    """Upload the dummy PDF to *url*, validate the OCR response, and score it.

    The endpoint must answer with a JSON list of
    ``{"page_number": int, "MD_text": str}`` dicts, one per page.  Each page
    is matched against the module-level ground truth ``data`` and scored with
    Character / Word Success Rates (1 - CER / 1 - WER, floored at 0).

    Returns a list of per-page metric dicts plus one global summary entry,
    or a single dict describing the validation error.
    """
    try:
        with open(pdf_file_path, 'rb') as pdf_file:
            files = {
                'file': (
                    'dummy.pdf',
                    pdf_file,
                    'application/pdf'
                )
            }
            # Timeout so a dead endpoint cannot hang the Gradio UI forever.
            response = requests.post(url, files=files, timeout=300)
    except Exception as e:
        # BUGFIX: the original wrote {"Error message: "f"..."} — implicit
        # string concatenation inside braces, i.e. a one-element *set*, which
        # gr.JSON cannot serialize.  Return a proper dict like every other
        # error path.
        return {"Error message": f"Error occurred while sending request. Error message: {e}"}

    try:
        response_json = response.json()
    except Exception as e:
        return {
            # str()/decode so the payload is JSON-serializable (the raw
            # exception object and the bytes body are not).
            "Error message": str(e),
            "Response": response.content.decode("utf-8", errors="replace")
        }

    # --- structural validation (guard clauses) ---------------------------
    if not isinstance(response_json, list):
        return {
            "Error message": "Response should be list of dictionaries.",
            "Response": response_json
        }

    for page in response_json:
        if not isinstance(page, dict):
            return {
                "Error message": "List should include only dictionaries.",
                "Response": response_json
            }
        if "page_number" not in page or "MD_text" not in page:
            return {
                "Error message": "Response is not in desired structure. Desired structure: [{'page_number': 1, 'MD_text': 'Extracted text'}]",
                "Response": response_json
            }
        if not (isinstance(page["page_number"], int) and isinstance(page["MD_text"], str)):
            return {
                "Error message": "'page_number' should be integer and 'MD_text' should be string.",
                "Response": response_json
            }

    if len(response_json) != len(data):
        return {
            "Error message": "The number of pages are not equal between transcription and ground truth.",
            "Response": response_json
        }

    # Index ground truth by page number: O(n) lookups instead of the
    # original nested O(n^2) scan over both lists.
    ground_truth = {t["page_number"]: t["MD_text"] for t in data}

    # --- metric computation ----------------------------------------------
    final_metrics = []
    total_reference = ""
    total_hypothesis = ""

    for page in response_json:
        if page["page_number"] not in ground_truth:
            # Pages with no matching ground-truth page number are skipped,
            # matching the original behavior.
            continue

        reference = _normalize(ground_truth[page["page_number"]])
        hypothesis = _normalize(page["MD_text"])

        total_reference += reference + " "
        total_hypothesis += hypothesis + " "

        # Success rates: 1 - error rate, floored at 0 (error rates can
        # exceed 1 when the hypothesis is much longer than the reference).
        cer_value = max(1 - cer(reference, hypothesis), 0)
        wer_value = max(1 - wer(reference, hypothesis), 0)

        final_metrics.append({
            "page_number": page["page_number"],
            "Character Success Rate (CSR)": round(cer_value, 4),
            "Word Success Rate (WSR)": round(wer_value, 4),
            "MD_text_used_for_metrics": hypothesis,
            "Ground_Truth_used_for_metrics": reference,
        })

    global_cer = max(1 - cer(total_reference.strip(), total_hypothesis.strip()), 0)
    global_wer = max(1 - wer(total_reference.strip(), total_hypothesis.strip()), 0)

    final_metrics.append({
        # Rounded to 4 places for consistency with the per-page values
        # (the original left the global metrics unrounded).
        "Global CSR": round(global_cer, 4),
        "Global WSR": round(global_wer, 4),
        "MD_text_used_for_metrics": total_hypothesis.strip(),
        "Ground_Truth_used_for_metrics": total_reference.strip(),
    })

    return final_metrics
# Gradio front end: a markdown header, a JSON result panel, a URL input box,
# and a Send button.  Components are created in render order (result panel
# above the input, as in the original layout).
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # OCR Endpoint Response Validator and Quality Checker

        Character Success Rate (CSR) and Word Success Rate (WSR) are metrics that will be provided for each page and total.
        They are calculated by simply subtracting CER and WER from 1 respectively.
        If CER or WER is > 1, CSR or WSR is considered as 0.

        Enter your endpoint below and click **Send** to get the result.

        Format:
        ```http://<host>/<endpoint>```
        """
    )

    output = gr.JSON(label="Output")
    input_box = gr.Textbox(label="Input", lines=1, placeholder="Type your text here...")
    send_btn = gr.Button("Send")

    # Clicking the button and pressing Enter in the textbox fire the same
    # request handler with the same wiring.
    for trigger in (send_btn.click, input_box.submit):
        trigger(fn=send_request, inputs=input_box, outputs=output)

demo.launch()