Spaces:
Runtime error
Runtime error
| # https://huggingface.co/spaces/rashisinghal/ai_speech_application | |
| # Here are the imports | |
| """ | |
| !pip install pymupdf | |
| !pip install git+https://github.com/huggingface/transformers.git | |
| !pip install datasets sentencepiece | |
| !pip install unidecode | |
| !pip install transformers | |
| !pip install gradio | |
| """ | |
| import gradio as gr | |
| import fitz | |
| import torch | |
| from unidecode import unidecode | |
| import pandas as pd | |
| import numpy as np | |
| import re | |
| import soundfile as sf | |
| from IPython.display import Audio | |
| from datasets import load_dataset | |
| from transformers import pipeline | |
| from transformers import SpeechT5HifiGan | |
| from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech | |
| # Here is the code | |
# Strips parenthesised / bracketed fragments before the upper-case test, so
# that e.g. a trailing "(a)" does not stop a heading from counting as upper case.
_BRACKETED_RE = re.compile(r"[\(\[].*?[\)\]]")
# Spans containing any of these characters never receive the bold/upper-case
# score bonus.
_SPECIAL_CHARS_RE = re.compile(r"[(_:/,#%\=@)]")
_SPAN_COLUMNS = ['xmin', 'ymin', 'xmax', 'ymax', 'text',
                 'is_upper', 'is_bold', 'span_font', 'font_size']
# Sample rate returned to the caller together with the waveform.
_SAMPLE_RATE = 16000


def _extract_spans(doc):
    """Return a DataFrame with one row per non-blank text span in *doc*.

    A span is a small chunk of text with uniform styling. For each span the
    bounding box, the ASCII-transliterated text, the font name, the font size
    and two style flags (``is_upper``, ``is_bold``) are recorded.
    """
    rows = []
    for page in doc:
        for block in page.get_text('dict')['blocks']:
            # Only blocks of type 0 are walked for lines/spans (per the
            # PyMuPDF docs these are the text blocks).
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    text = unidecode(span['text'])
                    if text.replace(" ", "") == "":
                        continue  # skip spans that contain only spaces
                    xmin, ymin, xmax, ymax = span['bbox']
                    span_font = span['font']
                    is_upper = _BRACKETED_RE.sub("", text).isupper()
                    is_bold = "bold" in span_font.lower()
                    rows.append((xmin, ymin, xmax, ymax, text,
                                 is_upper, is_bold, span_font, span['size']))
    return pd.DataFrame(rows, columns=_SPAN_COLUMNS)


def _tag_spans(span_df):
    """Return one style tag per row of *span_df* ('p', 'h<N>' or 's<N>').

    Each span gets a score: its rounded font size, plus one if bold and plus
    one if upper case (the bonuses are skipped when the text contains a
    special character). The most frequent score is taken to be body text
    ('p'); larger scores become headings 'h1', 'h2', ... from the largest
    down, and smaller scores become 's1', 's2', ... from just below 'p' down.
    """
    scores = []
    for row in span_df.itertuples(index=False):
        score = round(row.font_size)
        if not _SPECIAL_CHARS_RE.search(row.text):
            score += int(row.is_bold) + int(row.is_upper)
        scores.append(score)

    # Count how often every distinct style score occurs.
    values, counts = np.unique(scores, return_counts=True)
    values = values.tolist()
    # More occurrences means paragraph text rather than a heading. argmax
    # returns the first maximum, i.e. the smallest score on ties.
    p_size = values[int(np.argmax(counts))]

    tag_by_score = {}
    rank = 0
    for size in sorted(values, reverse=True):
        rank += 1
        if size == p_size:
            rank = 0  # restart numbering for the sub-paragraph styles
            tag_by_score[size] = 'p'
        elif size > p_size:
            tag_by_score[size] = f'h{rank}'
        else:
            tag_by_score[size] = f's{rank}'
    return [tag_by_score[score] for score in scores]


def _abstract_text(texts, tags):
    """Return the text grouped under the first non-empty 'Abstract' heading.

    Spans tagged as headings open a new section; every following non-heading
    span is appended to that section. Text that appears before the first
    heading is ignored. The heading comparison ignores case and surrounding
    whitespace.

    Raises:
        ValueError: if no 'Abstract' heading with content is found.
    """
    sections = []  # list of (heading, [content lines])
    for text, tag in zip(texts, tags):
        if tag.startswith('h'):
            sections.append((text, []))
        elif sections:
            sections[-1][1].append(text)

    for heading, lines in sections:
        if heading.strip().casefold() == 'abstract' and lines:
            return '\n'.join(lines)
    raise ValueError("Could not find an 'Abstract' section in the PDF.")


def pdf_to_speech(pdf_path):
    """Summarise the abstract of a PDF and convert the summary to speech.

    Args:
        pdf_path: Path of the PDF, or an object exposing the path as its
            ``name`` attribute (some Gradio versions pass the uploaded file
            as a temp-file wrapper instead of a plain string).

    Returns:
        A ``(sample_rate, waveform)`` tuple where ``waveform`` is a NumPy
        array, the form the "audio" output of the Gradio interface is given.

    Raises:
        ValueError: if no file is given, the PDF has no extractable text, or
            no 'Abstract' section can be located.
    """
    if pdf_path is None:
        raise ValueError("No PDF file was provided.")
    path = getattr(pdf_path, "name", pdf_path)

    # The context manager closes the document once the spans are extracted.
    with fitz.open(path) as doc:
        span_df = _extract_spans(doc)
    if span_df.empty:
        raise ValueError("No extractable text was found in the PDF.")

    # Work out which spans are headings and pull out the abstract's content.
    tags = _tag_spans(span_df)
    str_abstract = _abstract_text(span_df['text'], tags)

    # Summarise the abstract with pszemraj/long-t5-tglobal-base-sci-simplify.
    summarizer = pipeline(task="summarization", model="pszemraj/long-t5-tglobal-base-sci-simplify")
    # The pipeline returns a list of dicts; join their summaries into one string.
    str_summary = ",".join(item['summary_text'] for item in summarizer(str_abstract))

    # Tokenize the summary and synthesise speech with SpeechT5 + HiFi-GAN.
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    inputs = processor(text=str_summary, return_tensors="pt")

    # Fixed speaker voice: entry 7306 of the x-vector dataset.
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

    # Passing the vocoder makes generate_speech return the waveform directly,
    # so no separate spectrogram -> vocoder pass is needed.
    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
    return (_SAMPLE_RATE, speech.numpy())
# Describe the Gradio interface: which function handles a request, what the
# input and output widgets are, and the text shown on the page.
_interface_options = dict(
    fn=pdf_to_speech,
    inputs="file",
    outputs="audio",
    title="PDF Abstract to Audio Application",
    description="This App accepts PDF which has Abstract , summarises it and converts into Speech. Click to upload PDF with abstract.",
    theme="soft",
)
app = gr.Interface(**_interface_options)
# Start serving the interface.
app.launch()