File size: 8,010 Bytes
288ef5f
713e5ac
 
 
 
 
 
 
288ef5f
713e5ac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
import streamlit as st
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
from diffusers import StableDiffusionPipeline
import torch
from PIL import Image
import librosa
import tempfile
import os

# Page configuration — must be the first Streamlit call in the script.
st.set_page_config(page_title="Demo Multi-Modal AI", page_icon="🤖", layout="wide")

# -------- Model cache --------
@st.cache_resource(show_spinner=False)
def load_model(model_key):
    """Load and cache the model for *model_key*.

    Returns a ``transformers`` pipeline for all NLP/vision/audio tasks, or a
    ``diffusers`` ``StableDiffusionPipeline`` for ``'text_to_image'``.  Results
    are memoized by ``st.cache_resource`` so each model is loaded once per
    process.

    Raises:
        ValueError: if *model_key* is not a known task (previously this
            silently returned ``None``, which later failed with a confusing
            ``TypeError`` when called).
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    cache_dir = "model_cache"
    os.makedirs(cache_dir, exist_ok=True)

    if model_key == 'sentiment_analysis':
        return pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment-latest", device=device, cache_dir=cache_dir)
    elif model_key == 'text_classification':
        return pipeline("text-classification", model="distilbert-base-uncased-finetuned-sst-2-english", device=device, cache_dir=cache_dir)
    elif model_key == 'summarization':
        # max_length/min_length become default generation kwargs for the pipeline.
        return pipeline("summarization", model="facebook/bart-large-cnn", device=device, max_length=150, min_length=30, cache_dir=cache_dir)
    elif model_key == 'question_answering':
        return pipeline("question-answering", model="deepset/roberta-base-squad2", device=device, cache_dir=cache_dir)
    elif model_key == 'translation':
        return pipeline("translation", model="Helsinki-NLP/opus-mt-tc-big-en-pt", device=device, cache_dir=cache_dir)
    elif model_key == 'text_generation':
        tokenizer = AutoTokenizer.from_pretrained("gpt2", cache_dir=cache_dir)
        model = AutoModelForCausalLM.from_pretrained("gpt2", cache_dir=cache_dir)
        # GPT-2 has no pad token; reuse EOS so batched generation doesn't warn/fail.
        model.config.pad_token_id = model.config.eos_token_id
        return pipeline("text-generation", model=model, tokenizer=tokenizer, device=device)
    elif model_key == 'ner':
        return pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english", device=device, aggregation_strategy="simple", cache_dir=cache_dir)
    elif model_key == 'image_classification':
        return pipeline("image-classification", model="google/vit-base-patch16-224", device=device, cache_dir=cache_dir)
    elif model_key == 'object_detection':
        return pipeline("object-detection", model="facebook/detr-resnet-50", device=device, cache_dir=cache_dir)
    elif model_key == 'speech_to_text':
        return pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device, cache_dir=cache_dir)
    elif model_key == 'audio_classification':
        return pipeline("audio-classification", model="superb/hubert-base-superb-er", device=device, cache_dir=cache_dir)
    elif model_key == 'text_to_image':
        sd_pipe = StableDiffusionPipeline.from_pretrained(
            "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
            use_safetensors=True, safety_checker=None, cache_dir=cache_dir
        )
        # Fix: the pipeline was never moved to the GPU, so fp16 weights ran on
        # CPU even when CUDA was available.
        return sd_pipe.to(device)

    raise ValueError(f"Unknown model key: {model_key!r}")

# -------- Helper functions --------
def process_audio_file(audio_file):
    """Persist an uploaded audio file to disk and decode it to a waveform.

    Streamlit's ``UploadedFile`` is an in-memory buffer, but librosa wants a
    path, so the bytes are written to a named temp file first.

    Args:
        audio_file: Streamlit ``UploadedFile`` (has ``.name`` and ``.read()``).

    Returns:
        1-D float numpy array resampled to 16 kHz (the rate the speech
        pipelines in this app expect).
    """
    suffix = os.path.splitext(audio_file.name)[1]
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
        tmp_file.write(audio_file.read())
        tmp_file_path = tmp_file.name
    try:
        audio_array, _sr = librosa.load(tmp_file_path, sr=16000)
    finally:
        # Fix: previously the temp file leaked whenever librosa.load raised
        # (e.g. a corrupt upload); always clean it up.
        os.unlink(tmp_file_path)
    return audio_array

def process_image_file(image_file):
    """Open an uploaded image and normalize it to RGB mode.

    Vision pipelines expect 3-channel input, so palette/greyscale/RGBA
    uploads are converted; images already in RGB are returned unchanged.
    """
    img = Image.open(image_file)
    return img if img.mode == 'RGB' else img.convert('RGB')

def display_results(result, model_key, input_text=None):
    """Render a model's output in the Streamlit UI, per task.

    Args:
        result: raw pipeline output (shape varies by task: list of dicts for
            most pipelines, a dict for QA/ASR, a list of images for
            text-to-image).
        model_key: task key used to pick the rendering branch.
        input_text: original input, echoed back for summarization only.
    """
    if model_key == 'summarization':
        st.subheader("📝 Resumo")
        if input_text:
            st.markdown("**Texto Original:**")
            st.write(input_text)
        st.info(result[0]['summary_text'])
    elif model_key == 'translation':
        st.subheader("🌍 Tradução")
        st.success(result[0]['translation_text'])
    elif model_key in ['sentiment_analysis', 'text_classification']:
        st.subheader("📊 Resultados")
        for res in result:
            st.write(f"- **{res['label']}**: {res['score']:.2%}")
    elif model_key == 'question_answering':
        # Fix: QA output was previously never rendered — this branch was missing.
        st.subheader("❓ Resposta")
        st.success(result['answer'])
        st.write(f"**Confiança**: {result['score']:.2%}")
    elif model_key == 'ner':
        st.subheader("🔍 Entidades Reconhecidas")
        for entity in result:
            st.write(f"- **{entity['word']}**: {entity['entity_group']} ({entity['score']:.2%})")
    elif model_key == 'text_generation':
        st.subheader("🧠 Texto Gerado")
        st.write(result[0]['generated_text'])
    elif model_key == 'image_classification':
        st.subheader("🏷️ Classificação de Imagem")
        # Show only the 5 most confident labels.
        for res in result[:5]:
            st.write(f"- **{res['label']}**: {res['score']:.2%}")
    elif model_key == 'object_detection':
        st.subheader("📦 Objetos Detectados")
        for obj in result:
            st.write(f"- {obj['label']} ({obj['score']:.2%})")
    elif model_key == 'speech_to_text':
        st.subheader("🔈 Transcrição de Áudio")
        st.success(result['text'])
    elif model_key == 'audio_classification':
        st.subheader("🎧 Classificação de Áudio")
        # Pipeline output is sorted by score; the first entry is the top label.
        top_emotion = result[0]
        st.write(f"**Emoção detectada**: {top_emotion['label']} ({top_emotion['score']:.2%})")
    elif model_key == 'text_to_image':
        st.subheader("🎨 Imagem Gerada")
        st.image(result[0], caption="Imagem gerada a partir do texto")

# -------- Example inputs per task --------
# Default demo input for each task key. Key order drives the selectbox order
# in the UI; a value of None means the task takes an uploaded file instead of
# text. 'question_answering' is the only entry holding a dict (context +
# question) rather than a single string.
use_cases = {
    'sentiment_analysis': "A entrega foi super rápida, adorei!",
    'text_classification': "Estou insatisfeito com o produto",
    'summarization': "A empresa XYZ reportou um crescimento de 15% no último trimestre...",
    'question_answering': {
        'context': "O produto X tem garantia de 2 anos e pode ser configurado via app em 5 minutos.",
        'question': "Qual é o tempo de garantia do produto X?"
    },
    'translation': "Our product ensures high performance",
    'ner': "Microsoft assinou um contrato com a empresa XYZ em Nova York.",
    'text_generation': "Era uma vez um robô que",
    'speech_to_text': None,          # audio upload
    'audio_classification': None,    # audio upload
    'image_classification': None,    # image upload
    'object_detection': None,        # image upload
    'text_to_image': "Um carro futurista voando sobre Lisboa"
}

# -------- Interface --------
st.title("🤖 Demo Multi-Modal AI")
model_key = st.selectbox("Escolha o modelo para testar:", list(use_cases.keys()))
model = load_model(model_key)

if model_key in ['sentiment_analysis', 'text_classification', 'summarization', 'translation', 'text_generation', 'ner']:
    # Single-text tasks: one text area pre-filled with the demo example.
    input_text = st.text_area("Insira texto:", value=use_cases[model_key] if isinstance(use_cases[model_key], str) else "")
    if st.button("Executar"):
        result = model(input_text)
        display_results(result, model_key, input_text=input_text)

elif model_key == 'question_answering':
    # Fix: QA was unreachable before — it was missing from the text-task list
    # above, and its handling there was dead code. It now has its own branch
    # with editable context/question fields.
    qa_defaults = use_cases['question_answering']
    context = st.text_area("Contexto:", value=qa_defaults['context'])
    question = st.text_input("Pergunta:", value=qa_defaults['question'])
    if st.button("Executar"):
        result = model(question=question, context=context)
        st.subheader("❓ Resposta")
        st.success(result['answer'])
        st.write(f"**Confiança**: {result['score']:.2%}")

elif model_key in ['speech_to_text', 'audio_classification']:
    audio_file = st.file_uploader("Carregue um arquivo de áudio", type=['wav','mp3','flac','m4a'])
    if audio_file and st.button("Executar"):
        audio_data = process_audio_file(audio_file)
        # Fix: the decoded 16 kHz array was computed but the raw Streamlit
        # UploadedFile was passed to the pipeline, which cannot read it.
        result = model(audio_data)
        display_results(result, model_key)

elif model_key in ['image_classification', 'object_detection', 'text_to_image']:
    uploaded_file = st.file_uploader("Carregue uma imagem (ou deixe vazio para gerar)", type=['jpg','jpeg','png'])
    prompt = st.text_input("Prompt para gerar imagem (apenas text_to_image):", value=use_cases['text_to_image'] if model_key=='text_to_image' else "")
    if st.button("Executar"):
        if model_key == 'text_to_image':
            result = [model(prompt).images[0]]
        elif uploaded_file:
            image = process_image_file(uploaded_file)
            result = model(image)
        else:
            st.warning("Carregue uma imagem ou insira prompt para gerar.")
            result = None
        if result:
            display_results(result, model_key)