Spaces:
Running
Running
File size: 8,010 Bytes
288ef5f 713e5ac 288ef5f 713e5ac | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 | import streamlit as st
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
from diffusers import StableDiffusionPipeline
import torch
from PIL import Image
import librosa
import tempfile
import os
# Configuração da página
st.set_page_config(page_title="Demo Multi-Modal AI", page_icon="🤖", layout="wide")
# -------- Cache de modelos --------
@st.cache_resource(show_spinner=False)
def load_model(model_key):
    """Load (and cache per ``model_key``) the model behind one demo entry.

    Args:
        model_key: One of the task identifiers handled below, e.g.
            ``'sentiment_analysis'``, ``'text_generation'`` or
            ``'text_to_image'``.

    Returns:
        A ``transformers`` pipeline for the text, image and audio tasks, or
        a ``StableDiffusionPipeline`` for ``'text_to_image'``.

    Raises:
        ValueError: If ``model_key`` is not a known task identifier.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    cache_dir = "model_cache"
    os.makedirs(cache_dir, exist_ok=True)

    if model_key == 'text_generation':
        tokenizer = AutoTokenizer.from_pretrained("gpt2", cache_dir=cache_dir)
        model = AutoModelForCausalLM.from_pretrained("gpt2", cache_dir=cache_dir)
        # GPT-2 ships without a pad token; reuse EOS for padding.
        model.config.pad_token_id = model.config.eos_token_id
        return pipeline("text-generation", model=model, tokenizer=tokenizer, device=device)

    if model_key == 'text_to_image':
        diffusion = StableDiffusionPipeline.from_pretrained(
            "runwayml/stable-diffusion-v1-5",
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
            use_safetensors=True,
            safety_checker=None,
            cache_dir=cache_dir,
        )
        # The half-precision weights are only selected when CUDA is present,
        # so the pipeline must actually be moved onto that device.
        return diffusion.to(device)

    # model_key -> (pipeline task, checkpoint id, extra pipeline kwargs)
    pipeline_specs = {
        'sentiment_analysis': (
            "sentiment-analysis",
            "cardiffnlp/twitter-roberta-base-sentiment-latest",
            {},
        ),
        'text_classification': (
            "text-classification",
            "distilbert-base-uncased-finetuned-sst-2-english",
            {},
        ),
        'summarization': (
            "summarization",
            "facebook/bart-large-cnn",
            {"max_length": 150, "min_length": 30},
        ),
        'question_answering': (
            "question-answering",
            "deepset/roberta-base-squad2",
            {},
        ),
        'translation': (
            "translation",
            "Helsinki-NLP/opus-mt-tc-big-en-pt",
            {},
        ),
        'ner': (
            "ner",
            "dbmdz/bert-large-cased-finetuned-conll03-english",
            {"aggregation_strategy": "simple"},
        ),
        'image_classification': (
            "image-classification",
            "google/vit-base-patch16-224",
            {},
        ),
        'object_detection': (
            "object-detection",
            "facebook/detr-resnet-50",
            {},
        ),
        'speech_to_text': (
            "automatic-speech-recognition",
            "openai/whisper-base",
            {},
        ),
        'audio_classification': (
            "audio-classification",
            "superb/hubert-base-superb-er",
            {},
        ),
    }
    if model_key not in pipeline_specs:
        # Fail loudly instead of caching None and crashing later on model(...).
        raise ValueError(f"Unknown model key: {model_key!r}")

    task, checkpoint, extra_kwargs = pipeline_specs[model_key]
    # `pipeline()` hands unrecognised keyword arguments to the pipeline object
    # rather than to `from_pretrained`, so the download directory has to be
    # supplied through `model_kwargs`.
    return pipeline(
        task,
        model=checkpoint,
        device=device,
        model_kwargs={"cache_dir": cache_dir},
        **extra_kwargs,
    )
# -------- Funções auxiliares --------
def process_audio_file(audio_file):
    """Decode an uploaded audio file into a 16 kHz waveform.

    The upload is spooled to a temporary file (keeping its original
    extension) so that ``librosa`` can open it by path.

    Args:
        audio_file: File-like object exposing ``.name`` and ``.read()``.

    Returns:
        The audio samples returned by ``librosa.load`` with ``sr=16000``.
    """
    suffix = os.path.splitext(audio_file.name)[1]
    # delete=False: the file has to outlive the handle so it can be reopened
    # by path; it is removed explicitly in the `finally` below.
    tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
    try:
        with tmp_file:
            tmp_file.write(audio_file.read())
        audio_array, _ = librosa.load(tmp_file.name, sr=16000)
    finally:
        # Always clean up, including when writing or decoding fails.
        os.unlink(tmp_file.name)
    return audio_array
def process_image_file(image_file):
    """Open an uploaded image and return it in RGB mode.

    Args:
        image_file: Path or file-like object accepted by ``Image.open``.

    Returns:
        The opened image, converted to ``'RGB'`` when it was in another mode.
    """
    picture = Image.open(image_file)
    return picture if picture.mode == 'RGB' else picture.convert('RGB')
def display_results(result, model_key, input_text=None):
    """Render a model's output in the Streamlit page.

    Args:
        result: Raw output of the model selected by ``model_key``; each
            branch below indexes it in the shape that task produces.
        model_key: Task identifier selecting how ``result`` is rendered.
        input_text: Optional original text, echoed above the summary for
            the ``'summarization'`` task.
    """
    if model_key == 'summarization':
        st.subheader("📝 Resumo")
        if input_text:
            st.markdown("**Texto Original:**")
            st.write(input_text)
        st.info(result[0]['summary_text'])
    elif model_key == 'translation':
        st.subheader("🌍 Tradução")
        st.success(result[0]['translation_text'])
    elif model_key in ['sentiment_analysis', 'text_classification']:
        st.subheader("📊 Resultados")
        for res in result:
            st.write(f"- **{res['label']}**: {res['score']:.2%}")
    elif model_key == 'question_answering':
        # Previously unhandled: a QA result rendered nothing at all.
        # Assumes a single answer dict with 'answer' and 'score' keys, as
        # returned for one question/context pair.
        st.subheader("❓ Resposta")
        st.success(f"{result['answer']} ({result['score']:.2%})")
    elif model_key == 'ner':
        st.subheader("🔍 Entidades Reconhecidas")
        for entity in result:
            st.write(f"- **{entity['word']}**: {entity['entity_group']} ({entity['score']:.2%})")
    elif model_key == 'text_generation':
        st.subheader("🧠 Texto Gerado")
        st.write(result[0]['generated_text'])
    elif model_key == 'image_classification':
        st.subheader("🏷️ Classificação de Imagem")
        # Only the first five labels are shown.
        for res in result[:5]:
            st.write(f"- **{res['label']}**: {res['score']:.2%}")
    elif model_key == 'object_detection':
        st.subheader("📦 Objetos Detectados")
        for obj in result:
            st.write(f"- {obj['label']} ({obj['score']:.2%})")
    elif model_key == 'speech_to_text':
        st.subheader("🔈 Transcrição de Áudio")
        st.success(result['text'])
    elif model_key == 'audio_classification':
        st.subheader("🎧 Classificação de Áudio")
        top_emotion = result[0]
        st.write(f"**Emoção detectada**: {top_emotion['label']} ({top_emotion['score']:.2%})")
    elif model_key == 'text_to_image':
        st.subheader("🎨 Imagem Gerada")
        st.image(result[0], caption="Imagem gerada a partir do texto")
# -------- Casos de uso --------
# Default example input per task. Insertion order matters: it is the order in
# which the tasks are listed in the model selector. ``None`` marks tasks whose
# input is an uploaded file rather than text.
use_cases = dict(
    sentiment_analysis="A entrega foi super rápida, adorei!",
    text_classification="Estou insatisfeito com o produto",
    summarization="A empresa XYZ reportou um crescimento de 15% no último trimestre...",
    question_answering=dict(
        context="O produto X tem garantia de 2 anos e pode ser configurado via app em 5 minutos.",
        question="Qual é o tempo de garantia do produto X?",
    ),
    translation="Our product ensures high performance",
    ner="Microsoft assinou um contrato com a empresa XYZ em Nova York.",
    text_generation="Era uma vez um robô que",
    speech_to_text=None,
    audio_classification=None,
    image_classification=None,
    object_detection=None,
    text_to_image="Um carro futurista voando sobre Lisboa",
)
# -------- Interface --------
# Page body: pick a task, load its (cached) model, then show the input
# widgets that task needs and render the result when "Executar" is pressed.
st.title("🤖 Demo Multi-Modal AI")
model_key = st.selectbox("Escolha o modelo para testar:", list(use_cases))
model = load_model(model_key)

if model_key in ['sentiment_analysis', 'text_classification', 'summarization', 'translation', 'text_generation', 'ner']:
    input_text = st.text_area("Insira texto:", value=use_cases[model_key] if isinstance(use_cases[model_key], str) else "")
    if st.button("Executar"):
        result = model(input_text)
        display_results(result, model_key, input_text=input_text)
elif model_key == 'question_answering':
    # This task used to be unreachable: it was missing from the text-task
    # list above, and its call ignored user input in favour of the example.
    qa_example = use_cases['question_answering']
    context = st.text_area("Contexto:", value=qa_example['context'])
    question = st.text_input("Pergunta:", value=qa_example['question'])
    if st.button("Executar"):
        if not question.strip() or not context.strip():
            st.warning("Insira a pergunta e o contexto.")
        else:
            answer = model(question=question, context=context)
            # Rendered here so the answer is shown regardless of which
            # branches display_results implements.
            st.subheader("❓ Resposta")
            st.success(f"{answer['answer']} ({answer['score']:.2%})")
elif model_key in ['speech_to_text', 'audio_classification']:
    audio_file = st.file_uploader("Carregue um arquivo de áudio", type=['wav','mp3','flac','m4a'])
    if audio_file and st.button("Executar"):
        audio_data = process_audio_file(audio_file)
        # Pass the decoded 16 kHz waveform, not the upload object whose
        # stream process_audio_file has already consumed.
        result = model(audio_data)
        display_results(result, model_key)
elif model_key in ['image_classification', 'object_detection', 'text_to_image']:
    uploaded_file = st.file_uploader("Carregue uma imagem (ou deixe vazio para gerar)", type=['jpg','jpeg','png'])
    prompt = st.text_input("Prompt para gerar imagem (apenas text_to_image):", value=use_cases['text_to_image'] if model_key=='text_to_image' else "")
    if st.button("Executar"):
        if model_key == 'text_to_image':
            # Wrapped in a list so display_results can index result[0].
            result = [model(prompt).images[0]]
        elif uploaded_file:
            image = process_image_file(uploaded_file)
            result = model(image)
        else:
            st.warning("Carregue uma imagem ou insira prompt para gerar.")
            result = None
        if result:
            display_results(result, model_key)