"""Build a FAISS L2 index over Arabic news articles.

Reads /app/news_dataset.csv, renders each row into a single Arabic text
blob, requests an embedding for it from the Hugging Face Inference API,
and writes the resulting FAISS index to /app/arabic_news_index.
"""

import os

import numpy as np
import pandas as pd

# Dimensionality of the embedding vectors expected back from the model.
EMBEDDING_DIM = 4096

HF_API_URL = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B"


def create_textual_representation(row):
    """Render one dataset row as a single Arabic-labelled text block.

    Parameters
    ----------
    row : mapping (e.g. pandas Series) with keys 'writer', 'location',
        'date', 'time', 'title', 'news'.

    Returns
    -------
    str : the formatted multi-line representation used as embedding input.
    """
    return f"""
الكاتب: {row['writer']},
الموقع: {row['location']},
التاريخ: {row['date']},
الوقت: {row['time']},
العنوان: {row['title']},
الخبر: {row['news']}
"""


def main():
    # faiss and requests are imported lazily: they are only needed when the
    # script actually runs, and this keeps the pure helper above importable
    # in environments where they are not installed.
    import faiss
    import requests

    df = pd.read_csv('/app/news_dataset.csv')
    df['textual_representation'] = df.apply(create_textual_representation, axis=1)

    # Flat (exact) L2 index; vectors are added in bulk after the loop.
    index = faiss.IndexFlatL2(EMBEDDING_DIM)
    x = np.zeros((len(df), EMBEDDING_DIM), dtype='float32')

    # Bug fix: the original referenced os.getenv without importing os.
    headers = {"Authorization": f"Bearer {os.getenv('HF_TOKEN')}"}

    for i, text in enumerate(df['textual_representation']):
        print(f'Processing {i} instance')
        response = requests.post(
            HF_API_URL,
            headers=headers,
            json={"inputs": text},
            timeout=60,  # avoid hanging forever on a stalled API call
        )
        # Fail loudly on HTTP errors instead of a confusing KeyError below.
        response.raise_for_status()
        # NOTE(review): a text-generation model endpoint normally does not
        # return an 'embedding' field — this likely needs the
        # feature-extraction task / a sentence-embedding model. Confirm the
        # actual response schema against the HF Inference API docs.
        embedding = response.json()[0]['embedding']
        x[i] = np.array(embedding)

    index.add(x)
    faiss.write_index(index, "/app/arabic_news_index")


if __name__ == "__main__":
    main()