# TestTraining / train.py
# (scraped Hugging Face Hub header: uploaded by Noor22Tak, commit "Create train.py", f7678e6 verified)
# Standard library
import os

# Third-party
import faiss
import numpy as np
import pandas as pd
import requests
# Load dataset — assumes the CSV is mounted at /app inside the container
# with (at least) writer/location/date/time/title/news columns — TODO confirm schema.
df = pd.read_csv('/app/news_dataset.csv')
# Function to create the textual representation that gets embedded per row.
def create_textual_representation(row):
    """Return a multi-line Arabic text blob describing one news row.

    Args:
        row: mapping-like object (e.g. a pandas Series) exposing the keys
            'writer', 'location', 'date', 'time', 'title', 'news'.

    Returns:
        str: the labeled fields, one per line, starting and ending with a
        newline (the f-string literal is preserved exactly as authored,
        since it is the text sent to the embedding API).
    """
    # NOTE: the string lines are intentionally unindented — indenting them
    # would add leading spaces to the embedded text.
    return f"""
الكاتب: {row['writer']},
الموقع: {row['location']},
التاريخ: {row['date']},
الوقت: {row['time']},
العنوان: {row['title']},
الخبر: {row['news']}
"""
df['textual_representation'] = df.apply(create_textual_representation, axis=1)
# FAISS setup: exact (brute-force) L2-distance index.
# NOTE(review): dim=4096 assumes the API returns 4096-float embeddings
# (the Llama-3-8B hidden size) — verify against the actual response,
# otherwise the assignment into x below will raise on shape mismatch.
dim = 4096
index = faiss.IndexFlatL2(dim)
# Pre-allocated (num_rows, dim) buffer; float32 is what FAISS expects.
x = np.zeros((len(df), dim), dtype='float32')
# Generate embeddings via the Hugging Face Inference API (Meta-Llama-3-8B;
# the original comment said "Llama 3.1" but the URL below pins Llama 3).
HF_API_URL = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B"
# Auth token read from the HF_TOKEN environment variable; if unset this
# becomes "Bearer None" and requests will fail with 401.
HEADERS = {"Authorization": f"Bearer {os.getenv('HF_TOKEN')}"}
# Request one embedding per row and collect them into the pre-allocated x.
for i, text in enumerate(df['textual_representation']):
    print(f'Processing {i} instance')
    response = requests.post(HF_API_URL, headers=HEADERS, json={"inputs": text})
    # Fail fast on HTTP errors (bad/missing token, rate limit, model loading)
    # instead of surfacing them as a confusing KeyError on the JSON below.
    response.raise_for_status()
    # NOTE(review): assumes the endpoint returns [{'embedding': [...]}].
    # Meta-Llama-3-8B is a text-generation model, and the HF
    # feature-extraction pipeline usually returns a plain nested list —
    # TODO confirm the actual response shape before relying on this.
    embedding = response.json()[0]['embedding']
    x[i] = np.array(embedding, dtype='float32')
# Add all embeddings to the FAISS index in one batch call.
index.add(x)
# Persist the index to disk so it can be loaded for search later.
faiss.write_index(index, "/app/arabic_news_index")