File size: 1,763 Bytes
33a10cf
0456c8a
33a10cf
99101a1
 
 
38320c3
6f30cc4
0456c8a
 
 
99101a1
 
 
 
 
 
 
 
 
 
0456c8a
 
 
 
 
 
 
 
99101a1
0456c8a
 
99101a1
0456c8a
 
99101a1
 
 
33a10cf
 
99101a1
33a10cf
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import requests
import os

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
# Hugging Face API token read from the environment; None when the variable is unset.
my_token = os.getenv('my_repo_token')
def find_most_relevant_context(contexts, question, max_features=10000):
    """Return the context most similar to *question* under TF-IDF cosine similarity.

    Parameters
    ----------
    contexts : list[str]
        Candidate context passages.
    question : str
        The question to match against the contexts.
    max_features : int, optional
        Vocabulary-size cap passed to TfidfVectorizer (default 10000).

    Returns
    -------
    str
        The element of *contexts* with the highest similarity score; ties
        resolve to the first occurrence (numpy argmax semantics).

    Raises
    ------
    ValueError
        If *contexts* is empty — otherwise argmax on an empty score array
        fails with a cryptic numpy error.
    """
    if not contexts:
        raise ValueError("contexts must be a non-empty list of strings")

    # Fit TF-IDF on the question plus all contexts so they share one
    # vocabulary; row 0 is the question, rows 1..n are the contexts.
    tfidf_vectorizer = TfidfVectorizer(max_features=max_features)
    tfidf_matrix = tfidf_vectorizer.fit_transform([question] + contexts)

    # Cosine similarity of the question row against every context row.
    similarity_scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()

    # Highest-scoring context wins.
    most_relevant_index = similarity_scores.argmax()

    return contexts[most_relevant_index]

def load_and_prepare_dataset(file_path):
    """Load a JSON dataset file and return its 'train' split.

    The file is expected to keep its records under a top-level 'data'
    field (SQuAD-style layout).
    """
    return load_dataset('json', data_files=file_path, field='data')['train']

# Load the dataset and pull out the first question plus up to three
# candidate contexts (SQuAD-style layout: record -> paragraphs -> qas).
dataset = load_and_prepare_dataset('./train.json')
question = dataset[0]['paragraphs'][0]['qas'][0]['question']
# min(3, len(dataset)) guards against datasets with fewer than three records.
contexts = [dataset[i]['paragraphs'][0]['context'] for i in range(min(3, len(dataset)))]

# Pick the context most similar to the question via TF-IDF cosine similarity.
most_relevant_context = find_most_relevant_context(contexts, question)

# Build the model prompt: first 300 chars of the best context, then the question.
instruction = most_relevant_context[:300] + " " + question



# Hosted inference endpoint for Mistral-7B-Instruct-v0.2 on the HF Inference API.
API_URL = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2"
# Bearer auth header; NOTE(review): my_token may be None if the env var is unset — verify.
headers = {"Authorization": f"Bearer {my_token}"}

def query(payload):
    """POST *payload* to the HF Inference API and return the decoded JSON reply.

    A timeout is set because requests has none by default, which would let
    the script hang forever on a stalled connection. Error responses are
    still returned as their JSON bodies (e.g. {"error": ...}), matching the
    original behavior, so the caller can inspect and print them.
    """
    response = requests.post(API_URL, headers=headers, json=payload, timeout=60)
    return response.json()
	
# Send one request with the composed instruction and show the raw API reply
# (on failure this prints the API's JSON error body rather than raising).
output = query({
	"inputs": instruction,
})
print(output)