import os

import requests
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from datasets import load_dataset

# Hugging Face API token, read from the environment.
my_token = os.getenv('my_repo_token')


def find_most_relevant_context(contexts, question, max_features=10000):
    # Vectorize the question and contexts with a capped vocabulary size.
    tfidf_vectorizer = TfidfVectorizer(max_features=max_features)
    tfidf_matrix = tfidf_vectorizer.fit_transform([question] + contexts)
    # Compute cosine similarity between the question (row 0) and each context.
    similarity_scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()
    # Return the context with the highest similarity score.
    most_relevant_index = similarity_scores.argmax()
    return contexts[most_relevant_index]


def load_and_prepare_dataset(file_path):
    # Load a SQuAD-style JSON file; records live under the top-level "data" field.
    dataset = load_dataset('json', data_files=file_path, field='data')['train']
    return dataset


# Load the dataset and pull out a question plus up to three candidate contexts.
dataset = load_and_prepare_dataset('./train.json')
question = dataset[0]['paragraphs'][0]['qas'][0]['question']
contexts = [dataset[i]['paragraphs'][0]['context'] for i in range(min(3, len(dataset)))]

# Find the context most relevant to the question.
most_relevant_context = find_most_relevant_context(contexts, question)

# Build the model prompt: truncated context followed by the question.
instruction = most_relevant_context[:300] + " " + question

API_URL = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2"
headers = {"Authorization": f"Bearer {my_token}"}


def query(payload):
    # POST the payload to the Hugging Face Inference API and return the parsed JSON.
    response = requests.post(API_URL, headers=headers, json=payload)
    return response.json()


output = query({
    "inputs": instruction,
})
print(output)
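
# Optional hardening (a sketch, not part of the original script): the Inference
# API can return an {"error": ...} payload while the model is cold-loading, and
# text-generation settings such as "max_new_tokens" can be passed under a
# "parameters" key. The retry count and wait time below are illustrative
# assumptions, not values taken from the original.
import time


def query_with_retry(payload, max_retries=3, wait_seconds=10):
    result = None
    for _ in range(max_retries):
        result = query(payload)
        # Retry while the API reports an error (e.g. the model is still loading).
        if isinstance(result, dict) and "error" in result:
            time.sleep(wait_seconds)
            continue
        return result
    return result

# Example usage with explicit generation parameters:
# output = query_with_retry({
#     "inputs": instruction,
#     "parameters": {"max_new_tokens": 200, "return_full_text": False},
# })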