Noor22Tak committed on
Commit
f7678e6
·
verified ·
1 Parent(s): 1849627

Create train.py

Browse files
Files changed (1) hide show
  1. train.py +43 -0
train.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

import faiss
import numpy as np
import pandas as pd
import requests
5
+
6
# Read the news dataset from CSV into a DataFrame; each row is later
# rendered to text by create_textual_representation and embedded.
df = pd.read_csv("/app/news_dataset.csv")
8
+
9
+ # Function to create textual representation
10
def create_textual_representation(row):
    """Render one dataset row as a labelled, multi-line text block.

    Args:
        row: Any object indexable by the keys ``writer``, ``location``,
            ``date``, ``time``, ``title`` and ``news`` (a pandas Series when
            invoked through ``df.apply(..., axis=1)``).

    Returns:
        A string that starts and ends with a newline and contains one
        ``<Arabic label>: <value>`` line per field, separated by ``,\\n``.

    Raises:
        KeyError: If ``row`` lacks one of the expected keys.
    """
    # (Arabic label, column key) pairs, in the order they appear in the text:
    # writer, location, date, time, title, news.
    labelled_columns = (
        ("الكاتب", "writer"),
        ("الموقع", "location"),
        ("التاريخ", "date"),
        ("الوقت", "time"),
        ("العنوان", "title"),
        ("الخبر", "news"),
    )
    rendered = [f"{label}: {row[column]}" for label, column in labelled_columns]
    return "\n" + ",\n".join(rendered) + "\n"
19
+
20
# Render every row to the text that will be embedded.
df['textual_representation'] = df.apply(create_textual_representation, axis=1)

# FAISS setup
# `dim` must equal the length of the vectors the endpoint returns; every
# response is checked against it before being stored.
dim = 4096
index = faiss.IndexFlatL2(dim)
x = np.zeros((len(df), dim), dtype='float32')

# Generate embeddings using Meta-Llama-3-8B on the Hugging Face Inference API
HF_API_URL = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B"
HF_TOKEN = os.getenv('HF_TOKEN')
if not HF_TOKEN:
    # Fail fast instead of sending the literal header "Bearer None".
    raise RuntimeError("HF_TOKEN environment variable is not set")
HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"}
# Upper bound (seconds) for a single request so a stalled call cannot hang the run.
REQUEST_TIMEOUT_SECONDS = 120


def _fetch_embedding(text):
    """Request the embedding of *text* and return it as a float32 vector.

    Args:
        text: The string to embed.

    Returns:
        A 1-D ``float32`` numpy array of length ``dim``.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status.
        RuntimeError: If the JSON payload does not have the expected shape.
        ValueError: If the returned vector is not of length ``dim``.
    """
    response = requests.post(
        HF_API_URL,
        headers=HEADERS,
        json={"inputs": text},
        timeout=REQUEST_TIMEOUT_SECONDS,
    )
    response.raise_for_status()
    payload = response.json()

    # NOTE(review): the `[{"embedding": [...]}]` response schema is carried
    # over from the original code - confirm it against the endpoint in use.
    try:
        embedding = payload[0]['embedding']
    except (KeyError, IndexError, TypeError) as err:
        raise RuntimeError(
            f"Unexpected response from embedding endpoint: {payload!r}"
        ) from err

    vector = np.asarray(embedding, dtype='float32')
    if vector.shape != (dim,):
        raise ValueError(
            f"Expected an embedding of shape ({dim},), got {vector.shape}"
        )
    return vector


for i, text in enumerate(df['textual_representation']):
    print(f'Processing {i} instance')
    x[i] = _fetch_embedding(text)

# Add embeddings to FAISS index
index.add(x)

# Save FAISS index
faiss.write_index(index, "/app/arabic_news_index")