# Flying Shakespeare — Gradio Space pipeline:
# handwriting image → OCR (TrOCR) → Shakespeare-style poem (GPT-3.5) → speech (SpeechT5).
# Standard library
import io
import os

# Third-party
import gradio as gr
import numpy as np
import requests
import soundfile as sf
import torch
from datasets import load_dataset
from dotenv import load_dotenv, find_dotenv
from langchain.chains import LLMChain
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from PIL import Image
from transformers import (
    pipeline,
    SpeechT5ForTextToSpeech,
    SpeechT5HifiGan,
    SpeechT5Processor,
)

# Load API credentials (OpenAI / Hugging Face) from a local .env file so the
# functions below can read them via os.getenv.
load_dotenv(find_dotenv())
def handwriting_to_text(image):
    """Transcribe a handwritten image via the Hugging Face Inference API (TrOCR).

    Parameters
    ----------
    image : str
        Path to the image file to transcribe.

    Returns
    -------
    The parsed JSON body of the API response — for this model typically a
    list like ``[{"generated_text": "..."}]``.

    Raises
    ------
    requests.HTTPError
        If the Inference API returns a non-success status code.
    """
    API_URL = "https://api-inference.huggingface.co/models/microsoft/trocr-base-handwritten"
    # Bug fix: the bearer token was hard-coded as an empty string, so every
    # request was unauthenticated. Read it from the environment instead
    # (populated by load_dotenv at import time).
    headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_API_TOKEN', '')}"}
    with open(image, "rb") as f:
        data = f.read()
    # A timeout prevents the Gradio worker from hanging forever on a stalled
    # request; raise_for_status surfaces API errors instead of silently
    # returning an error payload downstream.
    response = requests.post(API_URL, headers=headers, data=data, timeout=60)
    response.raise_for_status()
    return response.json()
def generate_story(scenario):
    """Turn a single word or short phrase into a Shakespeare-tone poem.

    Parameters
    ----------
    scenario : str
        The context word/phrase the poem should be about (OCR output upstream).

    Returns
    -------
    str
        The generated poem (the prompt caps it at four lines).
    """
    template = """
Consider yourself as the famous poet "William Shakespere";
You can generate a poem in Shakespeare's tone based on a single word, the poem should be no more than 4 lines in length;
CONTEXT: {scenario}
POEM:
"""
    # Build the chain step by step: prompt, model, then the chain itself.
    poem_prompt = PromptTemplate(template=template, input_variables=["scenario"])
    llm = OpenAI(model_name="gpt-3.5-turbo", temperature=1)
    chain = LLMChain(llm=llm, prompt=poem_prompt, verbose=True)

    poem = chain.predict(scenario=scenario)
    print(poem)
    return poem
def recite_the_poem(content):
    """Synthesize speech for the given text with SpeechT5 and return WAV bytes.

    Also writes the audio to ``speech.wav`` in the working directory (same
    side effect as the original implementation).

    Parameters
    ----------
    content : str
        Text to speak.

    Returns
    -------
    bytes
        Raw WAV file contents (16 kHz mono).
    """
    # Perf fix: the processor, two models, and an entire speaker-embedding
    # dataset used to be re-downloaded/re-instantiated on EVERY call.
    # Memoize them on the function object so only the first call pays.
    if not hasattr(recite_the_poem, "_tts_assets"):
        processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
        model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
        vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
        embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
        # Fixed speaker voice: x-vector #7306 from the CMU ARCTIC set.
        speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
        recite_the_poem._tts_assets = (processor, model, vocoder, speaker_embeddings)
    processor, model, vocoder, speaker_embeddings = recite_the_poem._tts_assets

    inputs = processor(text=content, return_tensors="pt")
    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
    sf.write("speech.wav", speech.numpy(), samplerate=16000)
    # NOTE(review): raw bytes may not be what gr.Audio expects (filepath or
    # (sample_rate, ndarray) are the documented forms) — verify against the
    # installed Gradio version before changing the return type.
    with open("speech.wav", "rb") as audio_file:
        audio_data = audio_file.read()
    return audio_data
def main_model(image):
    """Full pipeline: handwriting image → OCR text → poem → spoken audio.

    Parameters
    ----------
    image : numpy.ndarray
        Image array as delivered by the Gradio ``"image"`` input component.

    Returns
    -------
    tuple
        ``(poem, audio_data)`` — the poem text and the WAV audio bytes, in
        the order of the interface's ``["text", "audio"]`` outputs.
    """
    image = Image.fromarray(np.uint8(image))
    image_path = "temp_image.png"
    image.save(image_path)
    try:
        ocr_result = handwriting_to_text(image_path)
    finally:
        # Bug fix: the temp file used to be left behind after every request.
        if os.path.exists(image_path):
            os.remove(image_path)
    # Bug fix: the raw API JSON (e.g. [{"generated_text": "..."}]) used to be
    # fed straight into the poem prompt; extract the transcription when the
    # response has the expected shape, otherwise fall back to its str form.
    if isinstance(ocr_result, list) and ocr_result and isinstance(ocr_result[0], dict):
        text = ocr_result[0].get("generated_text", str(ocr_result))
    else:
        text = str(ocr_result)
    poem = generate_story(text)
    audio_data = recite_the_poem(poem)
    return poem, audio_data
# Wire the pipeline into a minimal Gradio UI: one image in, the poem text
# plus its spoken rendition out.
OUTPUT_COMPONENTS = ["text", "audio"]

iface = gr.Interface(
    title="Flying Shakespeare",
    description="Upload the image generated from the Model:O101-M101/2",
    fn=main_model,
    inputs="image",
    outputs=OUTPUT_COMPONENTS,
)

if __name__ == "__main__":
    iface.launch()