Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import requests | |
| import json | |
| from jiwer import cer, wer | |
| import re | |
# Path of the sample PDF that is uploaded to the endpoint under test.
pdf_file_path = 'dummy.pdf'

# Ground-truth transcriptions, read once at import time. The scoring code looks
# up "page_number" and "MD_text" on each entry of this list.
with open("page_transcriptions.json", encoding="utf-8") as transcriptions_file:
    data = json.loads(transcriptions_file.read())
# Upper bound, in seconds, on the wait for the endpoint. Without one a stalled
# endpoint would block this handler indefinitely.
_REQUEST_TIMEOUT_SECONDS = 300


def _normalize_text(text):
    """Return *text* stripped, lower-cased, with whitespace runs collapsed."""
    return re.sub(r'\s+', ' ', text.strip().lower())


def _success_rate(error_rate, reference, hypothesis):
    """Return ``1 - error_rate(reference, hypothesis)``, clamped at 0.

    *error_rate* is the ``cer`` or ``wer`` callable. An error rate is a ratio
    over the length of the reference, so it is undefined for an empty
    reference; that case is scored here without calling *error_rate*: a
    perfect 1.0 when the hypothesis is empty too, otherwise 0.0.
    """
    if not reference:
        return 0.0 if hypothesis else 1.0
    return max(1 - error_rate(reference, hypothesis), 0)


def send_request(url):
    """Upload the sample PDF to *url* and score the returned transcription.

    The endpoint must answer with JSON shaped like
    ``[{'page_number': 1, 'MD_text': 'Extracted text'}]`` and cover exactly
    the pages present in the ground truth (``data``).

    Args:
        url: Address of the OCR endpoint that receives the PDF as a
            multipart ``file`` upload.

    Returns:
        On success, a list with one metrics dict per response page (in
        response order) followed by one dict holding the global scores.
        On any failure, a dict with an ``"Error message"`` entry and, once a
        response was received, a ``"Response"`` entry. Every value is
        JSON-serialisable.
    """
    try:
        with open(pdf_file_path, 'rb') as pdf_file:
            files = {'file': (pdf_file_path, pdf_file, 'application/pdf')}
            response = requests.post(
                url, files=files, timeout=_REQUEST_TIMEOUT_SECONDS
            )
    except Exception as e:
        # UI handler boundary: any failure (missing file, bad URL, network
        # error, timeout) is reported to the user instead of being raised.
        return {
            "Error message": f"Error occurred while sending request. Error message: {e}"
        }

    try:
        response_json = response.json()
    except Exception as e:
        # Stringify both values: an exception object and raw bytes are not
        # JSON data.
        return {
            "Error message": str(e),
            "Response": response.text
        }

    if not isinstance(response_json, list):
        return {
            "Error message": "Response should be list of dictionaries.",
            "Response": response_json
        }

    for page in response_json:
        if not isinstance(page, dict):
            return {
                "Error message": "List should include only dictionaries.",
                "Response": response_json
            }
        if "page_number" not in page or "MD_text" not in page:
            return {
                "Error message": "Response is not in desired structure. Desired structure: [{'page_number': 1, 'MD_text': 'Extracted text'}]",
                "Response": response_json
            }
        page_number = page["page_number"]
        # bool is a subclass of int, so JSON true/false must be rejected
        # explicitly.
        number_ok = isinstance(page_number, int) and not isinstance(page_number, bool)
        if not (number_ok and isinstance(page["MD_text"], str)):
            return {
                "Error message": "'page_number' should be integer and 'MD_text' should be string.",
                "Response": response_json
            }

    if len(response_json) != len(data):
        return {
            "Error message": "The number of pages are not equal between transcription and ground truth.",
            "Response": response_json
        }

    # NOTE(review): assumes ground-truth page numbers are unique - confirm
    # against page_transcriptions.json.
    truth_by_page = {entry["page_number"]: entry["MD_text"] for entry in data}
    returned_pages = {page["page_number"] for page in response_json}
    if returned_pages != set(truth_by_page):
        # Equal page counts do not guarantee the same pages (duplicates,
        # different numbering); scoring a partial match would be misleading.
        return {
            "Error message": "Page numbers in the response do not match the page numbers of the ground truth.",
            "Response": response_json
        }

    final_metrics = []
    references = []
    hypotheses = []
    for page in response_json:
        reference = _normalize_text(truth_by_page[page["page_number"]])
        hypothesis = _normalize_text(page["MD_text"])
        references.append(reference)
        hypotheses.append(hypothesis)
        final_metrics.append({
            "page_number": page["page_number"],
            "Character Success Rate (CSR)": round(_success_rate(cer, reference, hypothesis), 4),
            "Word Success Rate (WSR)": round(_success_rate(wer, reference, hypothesis), 4),
            "MD_text_used_for_metrics": hypothesis,
            "Ground_Truth_used_for_metrics": reference
        })

    # Global scores are computed over all pages joined by single spaces.
    total_reference = " ".join(references).strip()
    total_hypothesis = " ".join(hypotheses).strip()
    final_metrics.append({
        "Global CSR": _success_rate(cer, total_reference, total_hypothesis),
        "Global WSR": _success_rate(wer, total_reference, total_hypothesis),
        "MD_text_used_for_metrics": total_hypothesis,
        "Ground_Truth_used_for_metrics": total_reference
    })
    return final_metrics
# Page layout. Components are declared in this order: description, JSON
# result panel, endpoint textbox, send button.
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # OCR Endpoint Response Validator and Quality Checker
        Character Success Rate (CSR) and Word Success Rate (WSR) are metrics that will be provided for each page and total.
        They are calculated by simply subtracting CER and WER from 1 respectively.
        If CER or WER is > 1, CSR or WSR is considered as 0.
        Enter your endpoint below and click **Send** to get the result.
        Format:
        ```http://<host>/<endpoint>```
        """
    )
    output = gr.JSON(label="Output")
    input_box = gr.Textbox(
        label="Input", lines=1, placeholder="Type your text here..."
    )
    send_btn = gr.Button("Send")

    # Clicking the button and submitting the textbox both run send_request on
    # the textbox value and show its return value in the JSON panel.
    for register_listener in (send_btn.click, input_box.submit):
        register_listener(fn=send_request, inputs=input_box, outputs=output)

demo.launch()