File size: 4,386 Bytes
7204409
 
 
 
 
 
 
 
5bcb28b
7204409
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0748331
 
7204409
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import re
import json
import os
import urllib.parse
from pathlib import Path
import pandas as pd
from loguru import logger
from .abstracts import AbstractProcessor
from .s3 import S3Client  

class Processor(AbstractProcessor):
    def get_audio_paths(self, folder: str, skip: int = 3) -> list[str]:
        """Return the mp3 segment paths in *folder*, sorted by segment number.

        Files are expected to be named ``segment_<N>.mp3``; files whose name
        carries no parseable number sort last.  The first *skip* segments are
        dropped (historically the first 3 — presumably intro/non-verse audio,
        TODO confirm with the data owner).

        Args:
            folder: Directory to scan for ``*.mp3`` files.
            skip: Number of leading segments to drop; defaults to 3, the
                original hard-coded behavior.

        Returns:
            POSIX-style path strings in numeric segment order, minus the
            first ``skip`` entries.
        """
        def extract_number(file_path: str) -> float:
            # float("inf") pushes files without a segment number to the end
            # of the sort (the original annotation said -> int, which was
            # wrong for this fallback).
            match = re.search(r"segment_(\d+)", file_path)
            return int(match.group(1)) if match else float("inf")

        audio_paths = [p.as_posix() for p in Path(folder).glob("*.mp3")]
        audio_paths.sort(key=extract_number)
        return audio_paths[skip:]

    def process_text(self, text: str) -> str:
        """Normalize a raw transcription: strip '+'/'*' marker artifacts and
        quote characters, then trim surrounding whitespace.

        The substitutions are order-sensitive and applied exactly as listed.
        """
        regex_rules = (
            (r"\+\s*\.", "."),      # "+ ." collapses to "."
            (r"\*\s*\+\s*;", ";"),  # "* + ;" collapses to ";"
            (r"\*\s*\+", ""),       # remaining "* +" pairs vanish
        )
        for pattern, replacement in regex_rules:
            text = re.sub(pattern, replacement, text)
        for literal, replacement in ((" + ", " "), (" * ", " "), ("+", " ")):
            text = text.replace(literal, replacement)
        # Drop straight and curly double quotes.
        text = re.sub(r'["“”]', "", text)
        return text.strip()

    def splitter(self, text: str) -> list[str]:
        return re.split(r"[,:;.]", self.process_text(text))

    def flatten_nested_values(self, nested_values: pd.Series) -> list[str]:
        """Flatten groups of text fragments into one list.

        Each fragment has any leading digit run (verse number) removed and is
        whitespace-trimmed; fragments that end up empty are discarded.
        """
        leading_number = re.compile(r"^\d+\s*")
        return [
            cleaned
            for group in nested_values
            for item in group
            if (cleaned := leading_number.sub("", item).strip())
        ]

    def load_persistent_data(self, file: str) -> list:
        """Load previously saved JSON data from *file*.

        Returns an empty list when the file does not exist.
        """
        path = Path(file)
        if not path.exists():
            return []
        with path.open("r", encoding="utf-8") as fh:
            return json.load(fh)

    def save_persistent_data(self, data: list, file: str) -> None:
        """Persist *data* to *file* as pretty-printed UTF-8 JSON
        (non-ASCII characters written verbatim, 2-space indent)."""
        payload = json.dumps(data, ensure_ascii=False, indent=2)
        Path(file).write_text(payload, encoding="utf-8")

    def extract_audio_identifier(self, url: str):
        """Return ``(name, index)`` from the last two path segments of *url*:
        the percent-decoded second-to-last segment and the last segment as int."""
        segments = url.strip("/").split("/")
        raw_name, raw_index = segments[-2], segments[-1]
        return urllib.parse.unquote(raw_name), int(raw_index)

    def find_and_return_after_last(self, long_list: list, short_list: list) -> list:
        """Return the suffix of *long_list* after its LAST element that also
        appears in *short_list*.

        If no element of ``long_list`` is in ``short_list``, the whole list is
        returned unchanged.  Used to resume work after the last already-handled
        item.

        Args:
            long_list: Full ordered sequence of items.
            short_list: Items that mark already-processed positions.

        Returns:
            ``long_list[i + 1:]`` for the largest matching index ``i``, or
            ``long_list`` itself when there is no match.
        """
        # Scan from the right and stop at the first hit — same result as the
        # original full left-to-right scan, but exits early.  Membership stays
        # `in short_list` (not a set) so unhashable items keep working.
        for i in range(len(long_list) - 1, -1, -1):
            if long_list[i] in short_list:
                return long_list[i + 1:]
        return long_list

    def load_page_verses_and_audios(self, s3_client, page: str, df_verses: pd.DataFrame) -> tuple[list[str], list[str]]:
        """Load the verse fragments and audio segment paths still to label for
        one page, resuming after the most recent transcription saved on S3.

        Args:
            s3_client: Client exposing ``download_file(local_path, s3_key)``
                (see ``.s3.S3Client``).
            page: Local folder path of the page; expected to split into exactly
                three parts ending in ``<chapter>/page_<n>``.
            df_verses: Verse table with ``chapter``, ``page`` and
                ``moore_verse_text`` columns.

        Returns:
            ``(possible_values, audio_paths)`` — the remaining verse fragments
            and the remaining audio segment paths, in order.  On any failure in
            the resume step, the full (un-resumed) lists are returned.
        """
        audio_paths = self.get_audio_paths(page)
        # Normalize separators so the split below works for both POSIX and
        # Windows style paths.
        page_fixed = page.replace("/", "\\")
        # NOTE(review): assumes exactly three components, e.g.
        # "<root>\<chapter>\page_<n>" — a deeper path raises ValueError here.
        _, chapter, page_str = page_fixed.split("\\")
        s3_key = f"labelling/{chapter}/{page_str}/results.json"
        page_int = int(page_str.replace("page_", ""))
        # Restrict the verse table to this chapter/page, then split each verse
        # into candidate fragments.
        tmp = df_verses[(df_verses.chapter == chapter) & (df_verses.page == page_int)]
        possible_values = tmp["moore_verse_text"].apply(self.splitter)
        possible_values = self.flatten_nested_values(possible_values)
        try:
            # Pull this page's saved progress; any failure (missing key,
            # network error, empty/malformed JSON) drops to the except path
            # and the caller starts from the beginning of the page.
            s3_client.download_file("results.json",s3_key)
            transcriptions = self.load_persistent_data("results.json")
            latest_transcription = transcriptions[-1].get("transcriptions")
            latest_audio = [transcriptions[-1].get("segment_path")]
            # Resume after the last audio segment / fragments already handled.
            audio_paths = self.find_and_return_after_last(audio_paths, latest_audio)
            possible_values = self.find_and_return_after_last(possible_values, latest_transcription)
            logger.info(f"Latest transcription: {latest_audio} / {latest_transcription}")
            return possible_values, audio_paths
        except Exception as e:
            # Best-effort resume: log and fall back to the full lists.
            logger.error(f"An error occurred: {e}")
            return possible_values, audio_paths

    def get_contribution_data(self, s3_client) -> pd.DataFrame:
        """Collect every labelling result JSON under ``labelling/`` on S3 and
        return them as one tidy DataFrame.

        The ``segment_path`` column ("labelling/<chapter>/<page>/<segment>")
        is exploded into ``chapter``, ``page`` and ``segment`` columns; rows
        are sorted by chapter then page.  Returns an empty DataFrame when
        there is no data or anything goes wrong.
        """
        # NOTE(review): endswith("json") also matches names without the dot
        # (e.g. "foojson") — presumably ".json" was intended; verify.
        json_files = [f for f in s3_client.list_files("labelling") if f.endswith("json")]
        try:
            df = s3_client.load_json_files(files=json_files, unique_columns=["segment_path", "user_id"])
            # Split before the empty check, matching the original order: on an
            # empty frame the 4-column assignment raises and we log + bail out.
            df[["tmp1", "chapter", "page", "segment"]] = df.segment_path.str.split("/", expand=True)
            if df.empty:
                return pd.DataFrame()
            ordered = df.sort_values(["chapter", "page"])
            return ordered.drop(columns=["tmp1", "segment_path"])
        except Exception as e:
            logger.error(f"Error in get_contribution_data: {e}")
            return pd.DataFrame()