# Arabic news embedding pipeline: build FAISS index from Hugging Face Inference API embeddings.
import os

import faiss
import numpy as np
import pandas as pd
import requests
| # Load dataset | |
| df = pd.read_csv('/app/news_dataset.csv') | |
def create_textual_representation(row):
    """Render one dataset row as a single Arabic text blob for embedding.

    The output starts and ends with a newline; every field except the last
    carries a trailing comma, matching the original template exactly.
    """
    labeled = [
        ("الكاتب", row['writer']),
        ("الموقع", row['location']),
        ("التاريخ", row['date']),
        ("الوقت", row['time']),
        ("العنوان", row['title']),
    ]
    body = [f"{label}: {value}," for label, value in labeled]
    body.append(f"الخبر: {row['news']}")
    return "\n" + "\n".join(body) + "\n"
# One text blob per row; this is what gets sent to the embedding endpoint.
df['textual_representation'] = df.apply(create_textual_representation, axis=1)

# FAISS setup: exact (non-quantized) L2 index plus a preallocated
# float32 buffer, one row per document.
# NOTE(review): dim=4096 is assumed to match the remote model's embedding
# width — confirm against the endpoint's actual output.
dim = 4096
index = faiss.IndexFlatL2(dim)
x = np.zeros((df.shape[0], dim), dtype='float32')
# Generate embeddings through the Hugging Face Inference API
# (endpoint: meta-llama/Meta-Llama-3-8B; requires HF_TOKEN in the env).
HF_API_URL = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B"
HEADERS = {"Authorization": f"Bearer {os.getenv('HF_TOKEN')}"}

for i, text in enumerate(df['textual_representation']):
    print(f'Processing {i} instance')
    response = requests.post(HF_API_URL, headers=HEADERS, json={"inputs": text})
    # Fail fast with a descriptive HTTPError on 401 (bad/missing token),
    # 503 (model loading), etc. — otherwise the error JSON below would
    # crash with an opaque KeyError/TypeError.
    response.raise_for_status()
    embedding = response.json()[0]['embedding']
    # Explicit float32 keeps the copy into the preallocated buffer lossless
    # with respect to the buffer's own dtype.
    x[i] = np.asarray(embedding, dtype='float32')

# Add all embeddings to the FAISS index in one call, then persist to disk.
index.add(x)
faiss.write_index(index, "/app/arabic_news_index")