elvin.v.mammadov committed on
Commit
abec581
·
1 Parent(s): ccbcb24

initial config

Browse files
Files changed (5) hide show
  1. .gitattributes +1 -0
  2. app.py +154 -0
  3. dummy.pdf +3 -0
  4. page_transcriptions.json +18 -0
  5. requirements.txt +1 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ dummy.pdf filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import requests
3
+ import json
4
+ from jiwer import cer, wer
5
+
6
# Sample document that is uploaded to the endpoint under test.
pdf_file_path = 'dummy.pdf'

# Ground-truth page transcriptions, loaded once at import time; responses
# are scored against this list.
with open("page_transcriptions.json", encoding="utf-8") as transcriptions_file:
    data = json.load(transcriptions_file)
11
+
12
def send_request(url):
    """Upload the sample PDF to *url* and score the returned transcription.

    The endpoint is expected to answer with JSON shaped like
    ``[{'page_number': 1, 'MD_text': 'Extracted text'}, ...]``. Each page is
    compared with the ground-truth entry in ``data`` that has the same
    ``page_number`` using jiwer's character and word error rates.

    Args:
        url: Address of the transcription endpoint (free text typed by the
            user, so it may be malformed).

    Returns:
        On success, a list with one dict per page (``page_number``,
        ``Character Error Rate (CER)``, ``Word Error Rate (WER)``,
        ``MD_text``) followed by a final dict holding ``Global CER`` and
        ``Global WER``. On any failure, a dict with an ``Error message`` key
        and, when a response was received, a ``Response`` key. Every value is
        JSON-serializable so the result can be rendered by ``gr.JSON``.
    """
    try:
        with open(pdf_file_path, 'rb') as pdf_file:
            files = {
                'file': (
                    'dummy.pdf',
                    pdf_file,
                    'application/pdf'
                )
            }
            # (connect, read) timeout in seconds: without one, an endpoint
            # that never answers would block this handler indefinitely.
            response = requests.post(url, files=files, timeout=(10, 600))
    except Exception as e:
        # UI boundary: the URL is arbitrary user input, so any failure while
        # reading the file or sending it is reported instead of raised.
        return {"Error message": f"Error occurred while sending request. Error message: {e}"}

    try:
        response_json = response.json()
    except ValueError as e:
        # Report the error and body as text; the exception object and raw
        # bytes are not JSON-serializable.
        return {
            "Error message": str(e),
            "Response": response.text
        }

    structure_error = _validate_pages(response_json)
    if structure_error is not None:
        return {
            "Error message": structure_error,
            "Response": response_json
        }

    if len(response_json) != len(data):
        return {
            "Error message": "The number of pages are not equal between transcription and ground truth.",
            "Response": response_json
        }

    # Index the ground truth once instead of rescanning it for every page.
    references = {
        entry["page_number"]: entry["MD_text"].strip() for entry in data
    }
    received = [page["page_number"] for page in response_json]
    has_duplicates = len(set(received)) != len(received)
    has_unknown = any(number not in references for number in received)
    if has_duplicates or has_unknown:
        # Otherwise unmatched pages would be skipped silently and the
        # metrics would cover only part of the document.
        return {
            "Error message": "Page numbers in the response must be unique and match the ground truth page numbers.",
            "Response": response_json
        }

    final_metrics = []
    reference_parts = []
    hypothesis_parts = []

    try:
        for page in response_json:
            reference = references[page["page_number"]]
            hypothesis = page['MD_text'].strip()

            reference_parts.append(reference)
            hypothesis_parts.append(hypothesis)

            final_metrics.append({
                "page_number": page["page_number"],
                "Character Error Rate (CER)": round(cer(reference, hypothesis), 4),
                "Word Error Rate (WER)": round(wer(reference, hypothesis), 4),
                "MD_text": page['MD_text'],
            })

        total_reference = "\n".join(reference_parts).strip()
        total_hypothesis = "\n".join(hypothesis_parts).strip()

        global_cer = cer(total_reference, total_hypothesis)
        global_wer = wer(total_reference, total_hypothesis)
    except ValueError as e:
        # jiwer rejects empty reference text with ValueError; report it as
        # an error result rather than letting the handler crash.
        return {
            "Error message": f"Could not compute metrics: {e}",
            "Response": response_json
        }

    final_metrics.append({"Global CER": global_cer, "Global WER": global_wer})

    return final_metrics


def _validate_pages(response_json):
    """Return an error message if the payload is malformed, else ``None``."""
    if not isinstance(response_json, list):
        return "Response should be list of dictionaries."

    for page in response_json:
        if not isinstance(page, dict):
            return "List should include only dictionaries."

        if "page_number" not in page or "MD_text" not in page:
            return "Response is not in desired structure. Desired structure: [{'page_number': 1, 'MD_text': 'Extracted text'}]"

        if not (isinstance(page["page_number"], int) and isinstance(page["MD_text"], str)):
            return "'page_number' should be integer and 'MD_text' should be string."

    return None
117
+
118
# Single-page UI: JSON results on top, text entry and send button below.
with gr.Blocks() as demo:

    # Results pane (top); displays whatever send_request returns.
    output = gr.JSON(label="Output")

    # Single-line entry (bottom); its value is passed to send_request.
    input_box = gr.Textbox(
        label="Input",
        lines=1,
        placeholder="Type your text here...",
    )

    send_btn = gr.Button("Send")

    # Clicking the button and pressing Enter in the textbox run the same call.
    for register in (send_btn.click, input_box.submit):
        register(fn=send_request, inputs=input_box, outputs=output)

demo.launch()
dummy.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:40c5b733d0e21c53a9e40fa6eb9df01bb6898b842f5f13bd42db5624c2a97dd0
3
+ size 49672
page_transcriptions.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "page_number": 1,
4
+ "MD_text": "Paragraph 1"
5
+ },
6
+ {
7
+ "page_number": 2,
8
+ "MD_text": "Paragraph 2"
9
+ },
10
+ {
11
+ "page_number": 3,
12
+ "MD_text": "Paragraph 3"
13
+ },
14
+ {
15
+ "page_number": 4,
16
+ "MD_text": "Paragraph 4"
17
+ }
18
+ ]
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ jiwer