Spaces:

AILabTUL
/

APCR_BART

Sleeping

App Files Files Community

mpolacek committed on Dec 30, 2024

Commit

83479b5

verified ·

1 Parent(s): e51fb3b

Upload 2 files

Browse files

Files changed (2) hide show

app.py +84 -0
requirements.txt +7 -0

app.py ADDED Viewed

	@@ -0,0 +1,84 @@

+import streamlit as st
+from transformers import BartForConditionalGeneration, DebertaV2Tokenizer
+import torch
+import time
+from huggingface_hub import Repository
+repo = Repository(
+    local_dir="scripts",
+    repo_type="model",
+    clone_from="AILabTUL/APCR_BART",
+    token=True
+)
+repo.git_pull()
+# Nastavení stránky
+st.set_page_config(page_title="Text Punctuation and Capitalization Restoration", layout="wide")
+# Načtení modelu a tokenizeru do cache
+@st.cache_resource
+def load_model():
+    tokenizer = DebertaV2Tokenizer.from_pretrained("./scripts")
+    model = BartForConditionalGeneration.from_pretrained('./scripts')
+    model.load_state_dict(torch.load("./scripts/pytorch_model.bin", map_location=torch.device('cpu')))
+    model.eval()  # Přepnutí modelu do eval režimu
+    return model, tokenizer
+model, tokenizer = load_model()
+# Titulek aplikace
+st.title("Obnova interpunkce a velkých písmen v textu")
+# Vstupní formulář pro uživatele
+with st.form(key='input_form'):
+    input_text = st.text_area("Zadejte text bez interpunkce a velkých písmen:",
+                              value="Co jde podat Sněmovny už je Sněmovně Ve zrychleném čtení chceme schválit změnu zákoníku práce která by měla platit od 1. ledna",
+                              height=150)
+    submit_button = st.form_submit_button(label='Generovat')
+    input_text = input_text.replace("\n", " ").replace(".", " ").replace(",", " ").replace("?", " ").replace("!", " ").lower()
+if submit_button:
+    if not input_text.strip():
+        st.error("Prosím, zadejte nějaký text.")
+    else:
+        # Tokenizace vstupního textu
+        input_ids = tokenizer(input_text, return_tensors="pt").input_ids
+        eos_token_id = 32001
+        max_length = 50
+        generated_ids = torch.tensor([[model.config.decoder_start_token_id]])
+        output_placeholder = st.empty()
+        for _ in range(max_length):
+            # Forward průchod
+            outputs = model(
+                input_ids=input_ids,
+                decoder_input_ids=generated_ids
+            )
+            # Extrakce logits posledního tokenu
+            next_token_logits = outputs.logits[:, -1, :]
+            # Sampling nebo argmax pro výběr dalšího tokenu
+            next_token_id = torch.argmax(next_token_logits, dim=-1).unsqueeze(0)
+            # Přidání tokenu do generované sekvence
+            generated_ids = torch.cat([generated_ids, next_token_id], dim=1)
+            # Ukončení generace při dosažení EOS tokenu
+            if next_token_id.item() == eos_token_id:
+                break
+            # Malá prodleva pro viditelné generování (můžete upravit podle potřeby)
+            #time.sleep(0.3)
+            # Tokeny na text
+            generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
+            output_placeholder.text(generated_text)
+        st.success("Generování dokončeno!")

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+torch
+tqdm
+numpy
+sentencepiece
+transformers
+scikit-learn
+huggingface_hub