Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -97,14 +97,72 @@ def fine_tune_model():
|
|
| 97 |
|
| 98 |
st.session_state.model.eval()
|
| 99 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
# Initialize session state
|
| 101 |
if 'model_loaded' not in st.session_state:
|
| 102 |
st.session_state.tokenizer = T5Tokenizer.from_pretrained("t5-base")
|
| 103 |
st.session_state.model = T5ForConditionalGeneration.from_pretrained("t5-base")
|
| 104 |
st.session_state.model_loaded = True
|
| 105 |
-
|
| 106 |
-
# Rest of your existing functions (clean_generated_text and humanize_text remain the same)
|
| 107 |
-
[Previous clean_generated_text and humanize_text functions remain unchanged]
|
| 108 |
|
| 109 |
# UI Components
|
| 110 |
st.set_page_config(page_title="Advanced Text Humanizer", page_icon="🤖")
|
|
|
|
| 97 |
|
| 98 |
st.session_state.model.eval()
|
| 99 |
|
| 100 |
+
def clean_generated_text(text):
    """Strip echoed command prefixes from model output and tidy it.

    The T5 prompt sometimes gets echoed back at the start of the generated
    text (e.g. "reescreva o texto ...", "humanize: ..."). This removes any
    such leading command phrases and capitalizes the first remaining letter.

    Args:
        text: Raw text produced by the generation model.

    Returns:
        The cleaned text, with command prefixes removed and the first
        character upper-cased. Empty input yields an empty string.
    """
    text = text.strip()

    # Command prefixes (from the Portuguese prompts) the model may echo back.
    prefixes = [
        "reescreva o seguinte texto",
        "reescreva este texto",
        "reescreva o texto",
        "traduza",
        "humanize:",
        "humanizar:",
        "em português",
        "de forma mais natural",
    ]

    # Keep stripping until no prefix matches. The original single ordered
    # pass missed stacked commands: once "humanize:" was removed, an earlier
    # list entry such as "traduza" that became exposed was never re-checked.
    stripped = True
    while stripped:
        stripped = False
        text_lower = text.lower()
        for prefix in prefixes:
            if text_lower.startswith(prefix):
                text = text[len(prefix):].strip()
                stripped = True
                break  # restart the scan from the first prefix

    # Capitalize only the first letter, preserving the rest of the string
    # (str.capitalize would lower-case everything else).
    if text:
        text = text[0].upper() + text[1:]

    return text
|
| 128 |
+
|
| 129 |
+
def humanize_text(text):
    """Rewrite *text* in more natural Portuguese, keeping content and length.

    Builds a rewrite prompt, generates with the T5 model held in Streamlit
    session state, strips any echoed command prefixes via
    clean_generated_text, and finally pads the result with trailing words of
    the original input if the output came out shorter (word-wise).
    """
    prompt = f"reescreva em português natural, mantendo todas as informações: {text}"

    # Tokenize the prompt; anything beyond 1024 tokens is truncated.
    input_ids = st.session_state.tokenizer(
        prompt,
        return_tensors="pt",
        max_length=1024,
        truncation=True
    ).input_ids

    # Generation parameters tuned for coherence.
    # NOTE(review): do_sample=True combined with num_beams=3 mixes sampling
    # with beam search, and min_length counts TOKENS while len(text.split())
    # counts words — presumably intentional approximations; confirm.
    outputs = st.session_state.model.generate(
        input_ids,
        max_length=1024,
        min_length=len(text.split()),
        do_sample=True,
        temperature=0.1,
        top_p=0.95,
        num_beams=3,
        repetition_penalty=1.2,
        length_penalty=2.0
    )
    result = st.session_state.tokenizer.decode(outputs[0], skip_special_tokens=True)
    result = clean_generated_text(result)

    # Guarantee a minimum word count by appending the last missing words of
    # the original input. The whole deficit is closed in one append, so this
    # loop effectively runs at most one iteration.
    while len(result.split()) < len(text.split()):
        result += " " + " ".join(text.split()[-(len(text.split()) - len(result.split())):])

    return result
|
| 160 |
+
|
| 161 |
# Initialize session state: load the tokenizer/model once per browser session
# so Streamlit reruns reuse them instead of reloading on every interaction.
if 'model_loaded' not in st.session_state:
    # NOTE(review): from_pretrained may download/load t5-base — slow on first
    # run; st.cache_resource would share the model across sessions. Left as-is.
    st.session_state.tokenizer = T5Tokenizer.from_pretrained("t5-base")
    st.session_state.model = T5ForConditionalGeneration.from_pretrained("t5-base")
    st.session_state.model_loaded = True

# UI Components
# NOTE(review): st.set_page_config is documented to be the first Streamlit
# call in a script; here it runs after the session_state block above — verify
# this does not raise StreamlitAPIException at runtime.
st.set_page_config(page_title="Advanced Text Humanizer", page_icon="🤖")
|