# Arabic news embedding pipeline: build FAISS index from Hugging Face Inference API embeddings.
import os

import faiss
import numpy as np
import pandas as pd
import requests
| # Load dataset | |
| df = pd.read_csv('/app/news_dataset.csv') | |
def create_textual_representation(row):
    """Render one dataset row as a single Arabic text blob for embedding.

    The output starts and ends with a newline; every field except the last
    carries a trailing comma, matching the original template exactly.
    """
    labeled = [
        ("الكاتب", row['writer']),
        ("الموقع", row['location']),
        ("التاريخ", row['date']),
        ("الوقت", row['time']),
        ("العنوان", row['title']),
    ]
    body = [f"{label}: {value}," for label, value in labeled]
    body.append(f"الخبر: {row['news']}")
    return "\n" + "\n".join(body) + "\n"
# One text blob per row; this is what gets sent to the embedding endpoint.
df['textual_representation'] = df.apply(create_textual_representation, axis=1)

# FAISS setup: exact (non-quantized) L2 index plus a preallocated
# float32 buffer, one row per document.
# NOTE(review): dim=4096 is assumed to match the remote model's embedding
# width — confirm against the endpoint's actual output.
dim = 4096
index = faiss.IndexFlatL2(dim)
x = np.zeros((df.shape[0], dim), dtype='float32')
# Generate embeddings through the Hugging Face Inference API
# (endpoint: meta-llama/Meta-Llama-3-8B; requires HF_TOKEN in the env).
HF_API_URL = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B"
HEADERS = {"Authorization": f"Bearer {os.getenv('HF_TOKEN')}"}

for i, text in enumerate(df['textual_representation']):
    print(f'Processing {i} instance')
    response = requests.post(HF_API_URL, headers=HEADERS, json={"inputs": text})
    # Fail fast with a descriptive HTTPError on 401 (bad/missing token),
    # 503 (model loading), etc. — otherwise the error JSON below would
    # crash with an opaque KeyError/TypeError.
    response.raise_for_status()
    embedding = response.json()[0]['embedding']
    # Explicit float32 keeps the copy into the preallocated buffer lossless
    # with respect to the buffer's own dtype.
    x[i] = np.asarray(embedding, dtype='float32')

# Add all embeddings to the FAISS index in one call, then persist to disk.
index.add(x)
faiss.write_index(index, "/app/arabic_news_index")