import pandas as pd
import faiss
import numpy as np
import requests
import os
# import huggingface_hub

hf_token = os.getenv("hf_token")
# huggingface_hub.login(hf_token)

# Articles and their precomputed embeddings
df = pd.read_excel("Allam_SA_Articles.xlsx")
input_texts = df['Article_text'].tolist()
MOJ_embeddings = np.load('Allam_embeddings.npy')
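# Assumed invariant: the dataframe and the saved embeddings are row-aligned,
# i.e. row i of Allam_embeddings.npy embeds df.iloc[i]['Article_text'].
# Every index lookup below depends on that alignment.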
def embed_single_text(query):
    """Fetch an embedding for `query` from the hosted e5 embedding endpoint."""
    headers = {
        "Authorization": f"Bearer {hf_token}"
    }
    url = "https://allam-llm-e5-embeddings.hf.space/e5_embeddings"
    # Pass the query via `params` so requests URL-encodes it; Arabic text,
    # spaces, and punctuation would break a raw f-string URL.
    response = requests.get(url, headers=headers, params={"query": query})
    if response.status_code == 200:
        # FAISS expects a 2D float32 array of shape (n_queries, dim).
        return np.asarray(response.json(), dtype='float32').reshape(1, -1)
    else:
        print(f"Error: {response.status_code}")
        return None
# FAISS: exact inner-product index over the article embeddings
dimension = MOJ_embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(np.ascontiguousarray(MOJ_embeddings, dtype='float32'))  # FAISS requires contiguous float32
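# Note: IndexFlatIP ranks by raw inner product. If the stored and query
# embeddings are not already L2-normalized (e5 embeddings usually are),
# inner product is not cosine similarity; normalizing both sides makes the
# two equivalent. A minimal sketch, if normalization turns out to be needed:
#
#     faiss.normalize_L2(MOJ_embeddings)   # in place, before index.add(...)
#     faiss.normalize_L2(query_embedding)  # in place, before index.search(...)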
def query_search(query, K):
    """Return the dataframe row indices of the top-K articles for `query`."""
    query_embedding = embed_single_text(query)
    if query_embedding is None:
        return []  # embedding call failed upstream
    distances, indices = index.search(query_embedding, K)
    results = []
    for idx in indices[0]:
        # df.iloc[idx]['File_ID'] and df.iloc[idx]['Row_Number'] are available
        # here if callers ever need file-level identifiers instead of indices.
        results.append(int(idx))
    return results
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
def return_top5_chunks(query):
    """FAISS recall of 15 articles, then TF-IDF re-ranking of 150-word chunks."""
    matching_indices = query_search(query, 15)
    relevant_rows = df.iloc[matching_indices].copy()  # .copy() avoids SettingWithCopyWarning

    def chunk_text(text, max_words=150):
        words = text.split()
        return [' '.join(words[i:i + max_words]) for i in range(0, len(words), max_words)]

    relevant_rows['Chunks'] = relevant_rows['Article_text'].apply(chunk_text)
    chunked_texts = []
    for idx, row in relevant_rows.iterrows():
        for chunk in row['Chunks']:
            chunked_texts.append((chunk, idx))

    def find_top_k_similar(texts, query, k):
        documents = [text for text, _ in texts]
        vectorizer = TfidfVectorizer()
        all_texts = documents + [query]
        tfidf_matrix = vectorizer.fit_transform(all_texts)
        # The last row of the matrix is the query; compare it to every chunk.
        similarities = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1]).flatten()
        top_k_indices = similarities.argsort()[-k:][::-1]
        return [(texts[i], similarities[i]) for i in top_k_indices]

    top_5_chunks = find_top_k_similar(chunked_texts, query, 5)
    chunks_txt = ''
    for i, ((chunk, idx), similarity) in enumerate(top_5_chunks):
        chunks_txt += f"Index: {idx},\nChunk: {chunk}\n"
        if i < len(top_5_chunks) - 1:
            chunks_txt += "##########\n"
    return chunks_txt
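# Design note: dense retrieval (FAISS over e5 embeddings) does the broad
# recall, and sparse TF-IDF re-ranking picks the 5 most lexically similar
# 150-word chunks. The "Index: ..." / "##########" layout built above is
# exactly what the index-extraction prompt in return_index_num parses, so
# the two must stay in sync.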
# IBM Cloud IAM: exchange the API key for a short-lived bearer token.
api_key = os.getenv("ibm_api_key")  # read from the environment (name illustrative); never hardcode the key
url = "https://iam.cloud.ibm.com/identity/token"
headers = {
    "Content-Type": "application/x-www-form-urlencoded"
}
data = {
    "grant_type": "urn:ibm:params:oauth:grant-type:apikey",
    "apikey": api_key
}
response = requests.post(url, headers=headers, data=data)
response.raise_for_status()  # fail fast if the key is missing or invalid
token_info = response.json()
access_token = token_info['access_token']
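# IAM access tokens expire (expires_in is typically 3600 seconds), so a
# long-running Space would eventually need to mint a new one. A minimal
# sketch reusing the request above; the function name is illustrative and
# not part of the original:
def refresh_access_token():
    r = requests.post(url, headers=headers, data=data)
    r.raise_for_status()
    return r.json()['access_token']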
def allam_response(context, query):
    """Generate the final answer with the ALLaM model, grounded in `context`."""
    url = "https://eu-de.ml.cloud.ibm.com/ml/v1/text/generation?version=2023-05-29"
    input_text_base = f"""
    [Context]: {context}
    [System]:
    You are a friendly Arabic chatbot named مستنير.
    You will be provided with an Arabic context.
    Your task is to extract and answer questions only from the provided context,
    elaborating on the answer from the context.
    At the end of your response, mention the Article: مادة
    If no answer is found, apologize.

    Question: {query}
    """
    body = {
        "input": input_text_base,
        "parameters": {
            "decoding_method": "greedy",
            "max_new_tokens": 900,
            "min_new_tokens": 0,
            "stop_sequences": [],
            "repetition_penalty": 1
        },
        "model_id": "sdaia/allam-1-13b-instruct",
        "project_id": "72a4dcd4-e6e9-4cdc-9c7e-1a0ef1483936"
    }
    headers = {
        "Accept": "application/json",
        "Content-Type": "application/json",
        "Authorization": f"Bearer {access_token}"
    }
    response = requests.post(url, headers=headers, json=body)
    if response.status_code != 200:
        raise Exception("Non-200 response: " + str(response.text))
    response = response.json()
    return response['results'][0]['generated_text']
import re

def index_num(text):
    """Extract the numeric index from the model's {"Index": "..."} reply."""
    match = re.search(r'"Index":\s*"(\d+)"', text)
    # Return None (instead of crashing in int()) when the model answered
    # {"Index": "not_found"} or drifted off-format.
    return int(match.group(1)) if match else None
def get_top_matching_chunk(text, query, max_words=500):
    """Return the 500-word window of `text` most similar to `query` (TF-IDF)."""
    def chunk_text(text, max_words):
        words = text.split()
        return [' '.join(words[i:i + max_words]) for i in range(0, len(words), max_words)]

    chunks = chunk_text(text, max_words)
    vectorizer = TfidfVectorizer()
    all_texts = chunks + [query]
    tfidf_matrix = vectorizer.fit_transform(all_texts)
    similarities = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1]).flatten()
    top_chunk_index = similarities.argmax()
    return chunks[top_chunk_index]
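# Second chunking pass: the model only returns an article *index*, so the
# full article text is re-chunked at a coarser 500-word granularity and the
# single best window is sent as context, keeping the generation prompt short.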
def reformat_indentation(text, indent_spaces=4):
    indent = ' ' * indent_spaces
    lines = text.splitlines()
    formatted_lines = [indent + line.strip() for line in lines]
    return '\n'.join(formatted_lines)
def return_index_num(data_text, query):
    """Ask the model which chunk index answers `query`; expects a JSON reply."""
    url = "https://eu-de.ml.cloud.ibm.com/ml/v1/text/generation?version=2023-05-29"
    sys_prompt = """
    Identify the **first** Index chunk with the answer to a given question.
    Chunks are separated by ##########
    Respond only with **JSON** format **do not return any words**:
    {"Index": "extracted_Index"}
    Or:
    {"Index": "not_found"}
    **No additional text allowed**.
    """
    sys_prompt += f"Question : {query}"
    input_text = f"""
    [Context]: {data_text.strip()}
    [System]: {sys_prompt.strip()}
    """
    input_text = reformat_indentation(input_text, indent_spaces=0)
    body = {
        "input": input_text,
        "parameters": {
            "decoding_method": "greedy",
            "max_new_tokens": 20,
            "repetition_penalty": 1
        },
        "model_id": "sdaia/allam-1-13b-instruct",
        "project_id": "72a4dcd4-e6e9-4cdc-9c7e-1a0ef1483936"
    }
    headers = {
        "Accept": "application/json",
        "Content-Type": "application/json",
        "Authorization": f"Bearer {access_token}"  # module-level token from the IAM exchange above
    }
    response = requests.post(url, headers=headers, json=body)
    if response.status_code != 200:
        raise Exception("Non-200 response: " + str(response.text))
    response = response.json()
    return response['results'][0]['generated_text']
def allam_llm(q):
    """Full RAG pipeline: retrieve chunks, pick the article, answer from it."""
    chunks_text = return_top5_chunks(q)
    targeted_chunk = return_index_num(chunks_text, q)
    index_number = index_num(targeted_chunk)
    if index_number is None:
        # Model replied "not_found" (or off-format); with an empty context the
        # system prompt instructs the model to apologize.
        return allam_response("", q)
    text_to_chunk = df.iloc[index_number]['Article_text']
    top_chunk = get_top_matching_chunk(text_to_chunk, q)
    allam_res = allam_response(top_chunk, q)
    return allam_res
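# Minimal usage sketch. The sample question is illustrative; running it for
# real requires hf_token and the IBM API key in the environment, plus the
# data files (Allam_SA_Articles.xlsx, Allam_embeddings.npy) next to this script.
if __name__ == "__main__":
    sample_query = "ما هي عقوبة التستر التجاري؟"  # illustrative question
    print(allam_llm(sample_query))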