|
|
import pandas as pd |
|
|
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline |
|
|
from tqdm.auto import tqdm |
|
|
|
|
|
|
|
|
# Number of DataFrame rows handled per loop iteration below (slicing granularity,
# not the pipeline's internal inference batch size).
batch_size = 1000


# Hugging Face checkpoint: sequence classifier for French TV transcript headlines.
model_checkpoint = "PleIAs/French-TV-Headline-Classification"

# Cap tokenized inputs at 512 tokens so over-long transcripts can be truncated
# (truncation itself is requested at pipeline-call time).
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, model_max_length=512)

model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)

# Single reusable text-classification pipeline for all batches.
# NOTE(review): no `device` argument is given — presumably this runs on CPU; confirm
# and pass device/device_map explicitly if GPU inference is intended.
classification_pipeline = pipeline("text-classification", model=model, tokenizer=tokenizer)
|
|
|
|
|
|
|
|
# Input table of transcripts. "[file]" is a redacted placeholder — point it at the
# real parquet before running. Later code reads the "corrected_text" and
# "identifier" columns.
val_classification = pd.read_parquet("[file]")

# Reset to a clean 0..N-1 RangeIndex so positional slicing below stays aligned
# with the `text_id` values derived from batch offsets.
val_classification.reset_index(drop=True, inplace=True)


# Ceiling division: the final batch may hold fewer than batch_size rows.
num_batches = (len(val_classification) + batch_size - 1) // batch_size
|
|
|
|
|
|
|
|
# One DataFrame of classification results per batch; concatenated after the loop.
list_df = []


for i in tqdm(range(num_batches), desc="Processing batches"):
    # Positional slice [start_index, end_index) of the current batch.
    start_index = i * batch_size
    end_index = min((i + 1) * batch_size, len(val_classification))
    batch = val_classification.iloc[start_index:end_index]

    texts = batch["corrected_text"].tolist()
    # Hoist the identifiers once per batch: the original did a positional
    # batch.iloc[...] row lookup inside the innermost per-label loop, repeating
    # the same lookup for every label entry of every text.
    identifiers = batch["identifier"].tolist()

    # top_k=None returns the full label/score distribution for each text,
    # so `classifications` is a list (one per text) of lists of {label, score}.
    classifications = classification_pipeline(texts, truncation=True, padding=True, top_k=None)

    # Flatten to one row per (text, label) pair.
    rows = []
    for text_index, (class_results, identifier) in enumerate(zip(classifications, identifiers)):
        for entry in class_results:
            rows.append({
                'text_id': start_index + text_index,  # global row offset in val_classification
                'label': entry['label'],
                'score': round(entry['score'] * 100, 2),  # percentage, 2 decimals
                'identifier': identifier
            })

    list_df.append(pd.DataFrame(rows))
|
|
|
|
|
|
|
|
|
|
|
# Stitch the per-batch frames into one table and renumber rows from zero.
final_df = pd.concat(list_df).reset_index(drop=True)


# Quick visual sanity check of the assembled results.
print(final_df)


# Persist to CSV; the index is synthetic, so it is dropped from the output.
final_df.to_csv("transcript_classification.csv", index=False)
|
|
|