Spaces:
Runtime error
Runtime error
| import json | |
| import time | |
| from json.decoder import JSONDecodeError | |
| from typing import Union | |
| from .hallucinations import KNOWN_HALLUCINATIONS | |
| ALPHABET = [*"abcdefghijklmnopqrstuvwxyz"] | |
| class Transcript: | |
| """ | |
| Class for storing transcript data, including speaker information and text segments, | |
| and exporting it to various file formats such as JSON, HTML, and LaTeX. | |
| """ | |
| def __init__(self, transcript: dict) -> None: | |
| """ | |
| Initializes the Transcript object with the given transcript data. | |
| Args: | |
| transcript (dict): A dictionary containing the formatted transcript string. | |
| Keys should correspond to segment IDs, and values should | |
| contain speaker and segment information. | |
| """ | |
| self.transcript = transcript | |
| self._remove_hallucinations() | |
| self.speakers = self._extract_speakers() | |
| self.segments = self._extract_segments() | |
| self.annotation = {} | |
| def annotate(self, *args, **kwargs) -> dict: | |
| """ | |
| Annotates the transcript to associate specific names with speakers. | |
| Args: | |
| args (list): List of speaker names. These will be mapped sequentially to the speakers. | |
| kwargs (dict): Dictionary with speaker names as keys and list of segments as values. | |
| Returns: | |
| dict: Dictionary with speaker names as keys and list of segments as values. | |
| Raises: | |
| ValueError: If the number of speaker names does not match the number | |
| of speakers, or if an unknown speaker is found. | |
| """ | |
| annotations = {} | |
| if args and len(args) != len(self.speakers): | |
| raise ValueError( | |
| "Number of speaker names does not match number of speakers") | |
| if args: | |
| for arg, speaker in zip(args, sorted(self.speakers)): | |
| annotations[speaker] = arg | |
| invalid_speakers = set(kwargs.keys()) - set(self.speakers) | |
| if invalid_speakers: | |
| raise ValueError( | |
| f"These keys are not speakers: {', '.join(invalid_speakers)}") | |
| annotations.update({key: kwargs[key] | |
| for key in self.speakers if key in kwargs}) | |
| self.annotation = annotations | |
| return self | |
| def _remove_hallucinations(self) -> None: | |
| """ | |
| Removes all occurances of known hallucinations from all segments of the transcript. | |
| Segments that are identical to empty strings afterwards are removed from the transcript. | |
| """ | |
| segments_to_drop = [] | |
| for id in self.transcript: | |
| for snippet in KNOWN_HALLUCINATIONS: | |
| self.transcript[id]['text'] = self.transcript[id]['text'].replace( | |
| snippet, '') | |
| if self.transcript[id]['text'] == '': | |
| segments_to_drop.append(id) | |
| for id in segments_to_drop: | |
| del self.transcript[id] | |
| def _extract_speakers(self) -> list: | |
| """ | |
| Extracts the unique speaker names from the transcript. | |
| Returns: | |
| list: List of unique speaker names in the transcript. | |
| """ | |
| return list(set([self.transcript[id]["speakers"] for id in self.transcript])) | |
| def _extract_segments(self) -> list: | |
| """ | |
| Extracts all the text segments from the transcript. | |
| Returns: | |
| list: List of segments, where each segment is represented | |
| by the starting and ending times. | |
| """ | |
| return [self.transcript[id]["segments"] for id in self.transcript] | |
| def __str__(self) -> str: | |
| """ | |
| Converts the transcript to a string representation. | |
| Returns: | |
| str: String representation of the transcript, including speaker names and | |
| time stamps for each segment. | |
| """ | |
| fstring = "" | |
| for _id in self.transcript: | |
| seq = self.transcript[_id] | |
| if self.annotation: | |
| speaker = self.annotation[seq["speakers"]] | |
| else: | |
| speaker = seq["speakers"] | |
| segm = seq["segments"] | |
| sseg = time.strftime("%H:%M:%S", time.gmtime(segm[0])) | |
| eseg = time.strftime("%H:%M:%S", time.gmtime(segm[1])) | |
| fstring += f"{speaker} ({sseg} ; {eseg}):\t{seq['text']}\n" | |
| return fstring | |
| def __repr__(self) -> str: | |
| """Return a string representation of the Transcript object. | |
| Returns: | |
| str: A string that provides an informative description of the object. | |
| """ | |
| return f"Transcript(speakers = {self.speakers},"\ | |
| f"segments = {self.segments}, annotation = {self.annotation})" | |
| def get_dict(self) -> dict: | |
| """ | |
| Get transcript as dict | |
| :return: transcript as dict | |
| :rtype: dict | |
| """ | |
| return self.transcript | |
| def get_json(self, *args, use_annotation: bool = True, **kwargs) -> str: | |
| """ | |
| Get transcript as json string | |
| :return: transcript as json string | |
| :rtype: str | |
| """ | |
| if "indent" not in kwargs: | |
| kwargs["indent"] = 3 | |
| if use_annotation and self.annotation: | |
| for _id in self.transcript: | |
| seq = self.transcript[_id] | |
| seq["speakers"] = self.annotation[seq["speakers"]] | |
| return json.dumps(self.transcript, *args, **kwargs) | |
| def get_html(self) -> str: | |
| """ | |
| Get transcript as html string | |
| :return: transcript as html string | |
| :rtype: str | |
| """ | |
| html = "<p>" + self.__str__().replace("\n", "<br>") + "</p>" | |
| html = "<html><body>" + html + "</body></html>" | |
| html = html.replace("\t", " ") | |
| return html | |
| def get_md(self) -> str: | |
| """Get transcript as Markdown string, using HTML formatting. | |
| Returns: | |
| str: Transcript as a Markdown string. | |
| """ | |
| return self.get_html() | |
| def get_tex(self) -> str: | |
| """Get transcript as LaTeX string. If no annotations are present, the speakers will | |
| be annotated with the first letters of the alphabet. | |
| Returns: | |
| str: Transcript as LaTeX string. | |
| """ | |
| if not self.annotation: | |
| self.annotate(*ALPHABET[:len(self.speakers)]) | |
| fstring = "\\begin{drama}" | |
| for speaker in self.speakers: | |
| fstring += "\n\t\\Character{" + str(self.annotation[speaker]) + "}" \ | |
| "{" + str(self.annotation[speaker]) + "}" | |
| for id in self.transcript: | |
| seq = self.transcript[id] | |
| speaker = self.annotation[seq["speakers"]] | |
| fstring += f"\n\\{speaker}speaks:\n{seq['text']}" | |
| fstring += "\n\\end{drama}" | |
| return fstring | |
| def to_json(self, path, *args, **kwargs) -> None: | |
| """Save transcript as json file | |
| Args: | |
| path (str): path to save file | |
| """ | |
| with open(path, "w") as f: | |
| json.dump(self.transcript, f, *args, **kwargs) | |
| def to_txt(self, path: str) -> None: | |
| """Save transcript as a LaTeX file (placeholder function, implementation needed). | |
| Args: | |
| path (str): Path to save the LaTeX file. | |
| """ | |
| with open(path, "w") as f: | |
| f.write(self.__str__()) | |
| def to_md(self, path: str) -> None: | |
| """Get transcript as Markdown string, using HTML formatting. | |
| Returns: | |
| str: Transcript as a Markdown string. | |
| """ | |
| return self.to_html(path) | |
| def to_html(self, path: str) -> None: | |
| """ | |
| Save transcript as html file | |
| :param path: path to save file | |
| :type path: str | |
| """ | |
| with open(path, "w") as file: | |
| file.write(self.get_html()) | |
| def to_tex(self, path: str) -> None: | |
| """Save transcript as a LaTeX file (placeholder function, implementation needed). | |
| Args: | |
| path (str): Path to save the LaTeX file. | |
| """ | |
| pass | |
| def to_pdf(self, path: str) -> None: | |
| """Save transcript as a PDF file (placeholder function, implementation needed). | |
| Args: | |
| path (str): Path to save the PDF file. | |
| """ | |
| pass | |
| def save(self, path: str, *args, **kwargs) -> None: | |
| """Save transcript to file with the given path and file format. | |
| This method can save the transcript in various formats including JSON, TXT, | |
| MD, HTML, TEX, and PDF. The file format is determined by the extension of | |
| the path. | |
| Args: | |
| path (str): Path to save the file, including the desired file extension. | |
| *args: Additional positional arguments to be passed to the specific save methods. | |
| **kwargs: Additional keyword arguments to be passed to the specific save methods. | |
| Raises: | |
| ValueError: If the file format specified in the path is unknown. | |
| """ | |
| if path.endswith(".json"): | |
| self.to_json(path, *args, **kwargs) | |
| elif path.endswith(".txt"): | |
| self.to_txt(path, *args, **kwargs) | |
| elif path.endswith(".md"): | |
| self.to_md(path, *args, **kwargs) | |
| elif path.endswith(".html"): | |
| self.to_html(path, *args, **kwargs) | |
| elif path.endswith(".tex"): | |
| self.to_tex(path, *args, **kwargs) | |
| elif path.endswith(".pdf"): | |
| self.to_pdf(path, *args, **kwargs) | |
| else: | |
| raise ValueError("Unknown file format") | |
| def from_json(cls, _json: Union[dict, str]) -> "Transcript": | |
| """Load transcript from json file | |
| Args: | |
| path (str): path to json file | |
| Returns: | |
| Transcript: Transcript object | |
| """ | |
| if isinstance(_json, dict): | |
| return cls(_json) | |
| else: | |
| try: | |
| transcript = json.loads(_json) | |
| except (TypeError, JSONDecodeError): | |
| with open(_json, "r") as f: | |
| transcript = json.load(f) | |
| return cls(transcript) | |