# French-TV-Headline-Classification / inference_classification_transcript.py
# Author: Pclanglais — "Create inference_classification_transcript.py"
# (commit 1b0843f, verified)
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from tqdm.auto import tqdm
# Configuration constants.
batch_size = 1000
model_checkpoint = "PleIAs/French-TV-Headline-Classification"


def _flatten_classifications(classifications, identifiers, start_index):
    """Flatten per-text label/score lists into flat row dicts.

    Parameters
    ----------
    classifications : list[list[dict]]
        One list of ``{"label", "score"}`` dicts per input text (the
        pipeline is called with ``top_k=None``, so every label is scored
        for every text).
    identifiers : sequence
        The batch's ``identifier`` column values, aligned with
        ``classifications``.
    start_index : int
        Global offset of this batch, used to build a stable ``text_id``.

    Returns
    -------
    list[dict]
        One row per (text, label) pair; score is a percentage rounded
        to 2 decimals.
    """
    rows = []
    for text_index, class_results in enumerate(classifications):
        identifier = identifiers[text_index]
        for entry in class_results:
            rows.append({
                'text_id': start_index + text_index,
                'label': entry['label'],
                'score': round(entry['score'] * 100, 2),
                'identifier': identifier,
            })
    return rows


def classify_dataframe(frame, text_pipeline, batch_size=batch_size,
                       show_progress=True):
    """Classify every row of *frame* in batches.

    *frame* must carry ``corrected_text`` and ``identifier`` columns.
    Returns a long-format DataFrame with columns
    ``text_id`` / ``label`` / ``score`` / ``identifier`` — one row per
    (text, label) pair.
    """
    batch_frames = []
    starts = range(0, len(frame), batch_size)
    if show_progress:
        starts = tqdm(starts, desc="Processing batches")
    for start_index in starts:
        batch = frame.iloc[start_index:start_index + batch_size]
        texts = batch["corrected_text"].tolist()
        # top_k=None -> return scores for every label, not just the argmax.
        classifications = text_pipeline(
            texts, truncation=True, padding=True, top_k=None
        )
        rows = _flatten_classifications(
            classifications, batch["identifier"].tolist(), start_index
        )
        batch_frames.append(pd.DataFrame(rows))
    # Fix: the original crashed on an empty input dataset
    # (pd.concat raises ValueError on an empty list).
    if not batch_frames:
        return pd.DataFrame(columns=["text_id", "label", "score", "identifier"])
    return pd.concat(batch_frames, ignore_index=True)


def main():
    """Load model and data, run classification, write the CSV report."""
    tokenizer = AutoTokenizer.from_pretrained(
        model_checkpoint, model_max_length=512
    )
    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
    classification_pipeline = pipeline(
        "text-classification", model=model, tokenizer=tokenizer
    )
    # NOTE(review): "[file]" is a placeholder left in the published script —
    # point it at the real parquet path before running.
    val_classification = pd.read_parquet("[file]")
    val_classification.reset_index(drop=True, inplace=True)
    final_df = classify_dataframe(val_classification, classification_pipeline)
    print(final_df)
    final_df.to_csv("transcript_classification.csv", index=False)


if __name__ == "__main__":
    main()