File size: 1,213 Bytes
f7678e6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import os

import faiss
import numpy as np
import pandas as pd
import requests

# Load the news dataset. Rows are expected to provide the columns
# consumed below: writer, location, date, time, title, news.
df = pd.read_csv('/app/news_dataset.csv')

def create_textual_representation(row):
    """Return an Arabic-labelled, multi-line text summary of one news row.

    The result is fed to the embedding model downstream, so the exact
    formatting (labels, commas, leading indentation) is part of the output.
    """
    template = (
        " \n"
        "    الكاتب: {writer},\n"
        "    الموقع: {location},\n"
        "    التاريخ: {date},\n"
        "    الوقت: {time},\n"
        "    العنوان: {title},\n"
        "    الخبر: {news}\n"
        "    "
    )
    return template.format(
        writer=row['writer'],
        location=row['location'],
        date=row['date'],
        time=row['time'],
        title=row['title'],
        news=row['news'],
    )

# Build the text that will be embedded for each row.
df['textual_representation'] = df.apply(create_textual_representation, axis=1)

# FAISS setup: exact (brute-force) L2 index.
# NOTE(review): dim=4096 must match the embedding dimension the endpoint
# actually returns — confirm before relying on the index.
dim = 4096
index = faiss.IndexFlatL2(dim)
x = np.zeros((len(df), dim), dtype='float32')

# Generate embeddings via the Hugging Face Inference API.
HF_API_URL = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B"
# Fix: 'os' was referenced here without being imported (see import block).
HEADERS = {"Authorization": f"Bearer {os.getenv('HF_TOKEN')}"}

for i, text in enumerate(df['textual_representation']):
    print(f'Processing {i} instance')

    response = requests.post(HF_API_URL, headers=HEADERS, json={"inputs": text})
    # Fail fast on HTTP errors (bad token, rate limit, model loading)
    # instead of raising a confusing KeyError on the error payload below.
    response.raise_for_status()
    payload = response.json()
    # NOTE(review): a plain text-generation model endpoint does not normally
    # return an 'embedding' field — verify this response shape, or switch to
    # a feature-extraction model/pipeline endpoint.
    embedding = payload[0]['embedding']

    x[i] = np.array(embedding, dtype='float32')

# Add all embeddings to the index in one batch.
index.add(x)

# Persist the index to disk for later similarity search.
faiss.write_index(index, "/app/arabic_news_index")