ParsVoice: A Large-Scale Multi-Speaker Persian Speech Corpus for Text-to-Speech Synthesis
Paper • 2510.10774 • Published • 3
A BERT-based classifier that determines whether a Persian sentence is Complete or Incomplete.
Designed for use in ASR post-processing pipelines (e.g., applied to the transcript produced by a speech-to-text system).
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
# Checkpoint identifier passed to both `from_pretrained` calls below.
MODEL_ID = "MohammadJRanjbar/persian-sentence-completion"
# Load the tokenizer and the sequence-classification model for that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID)
# Switch the model to evaluation mode; it is only used for inference here.
model.eval()
# Maps a predicted class index (argmax over the logits) to its readable label.
label_map = {0: "Incomplete", 1: "Complete"}
def classify_sentences(sentences, batch_size=16, *, max_length=128):
    """Label each sentence as "Complete" or "Incomplete".

    Args:
        sentences: A single sentence (``str``) or any iterable of sentences.
            A bare string is treated as one sentence rather than being
            sliced into character chunks.
        batch_size: Number of sentences tokenized and scored per forward
            pass. Must be at least 1.
        max_length: Maximum number of tokens per sentence; longer inputs are
            truncated by the tokenizer.

    Returns:
        A list with one label per input sentence, in input order. An empty
        input yields an empty list.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # Validate up front: a negative step would make ``range`` empty and
    # silently drop every sentence, and a zero step raises a cryptic error.
    if batch_size < 1:
        raise ValueError(
            f"batch_size must be a positive integer, got {batch_size!r}"
        )

    if isinstance(sentences, str):
        sentences = [sentences]
    else:
        # Materialize so generators and other iterables support len()/slicing.
        sentences = list(sentences)

    results = []
    for start in range(0, len(sentences), batch_size):
        batch = sentences[start:start + batch_size]
        inputs = tokenizer(
            batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_length,
        )
        # Keep the tensors on the same device as the model (no-op on CPU).
        inputs = inputs.to(model.device)
        with torch.no_grad():
            logits = model(**inputs).logits
        predicted = torch.argmax(logits, dim=-1)
        results.extend(label_map[index] for index in predicted.tolist())
    return results
# Example: the first text stops before its closing verb; the second adds the
# verb and a full stop.
texts = ["امروز هوا خیلی عالی", "امروز هوا خیلی عالی است."]
print(classify_sentences(texts))
# Expected output: ['Incomplete', 'Complete']
If you use this model, please cite the following works:
@inproceedings{ranjbar-kalahroodi-etal-2026-persianpunc,
title = "{P}ersian{P}unc: A Large-Scale Dataset and {BERT}-Based Approach for {P}ersian Punctuation Restoration",
author = "Ranjbar Kalahroodi, Mohammad Javad and
Faili, Heshaam and
Shakery, Azadeh",
editor = "Merchant, Rayyan and
Megerdoomian, Karine",
booktitle = "The Proceedings of the First Workshop on {NLP} and {LLM}s for the {I}ranian Language Family",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.silkroadnlp-1.11/",
doi = "10.18653/v1/2026.silkroadnlp-1.11",
pages = "105--113",
ISBN = "979-8-89176-371-5",
}
@misc{kalahroodi2025parsvoicelargescalemultispeakerpersian,
title = {ParsVoice: A Large-Scale Multi-Speaker Persian Speech Corpus for Text-to-Speech Synthesis},
author = {Mohammad Javad Ranjbar Kalahroodi and Heshaam Faili and Azadeh Shakery},
year = {2025},
eprint = {2510.10774},
archivePrefix = {arXiv},
primaryClass = {cs.SD},
url = {https://arxiv.org/abs/2510.10774},
}