| | import gradio as gr |
| | from sentence_transformers import SentenceTransformer |
| | from wikipediaapi import Wikipedia |
| | import textwrap |
| | import numpy as np |
| | from openai import OpenAI |
| |
|
| | |
def process_query(wiki_page, model_name, embed_dim, query, api_key):
    """Answer an Arabic-language query with RAG over one Wikipedia page.

    Embeds the page's paragraphs with the selected sentence-transformers
    model, retrieves the top-3 most similar paragraphs to ``query`` by
    cosine similarity, and asks gpt-4o to answer using only that context.

    Args:
        wiki_page: Title of the Arabic Wikipedia page to use as the corpus.
        model_name: Display name of the embedding model (key of the mapping below).
        embed_dim: Matryoshka truncation dimension passed to SentenceTransformer.
        query: The user's question (in Arabic).
        api_key: OpenAI API key used for the chat completion call.

    Returns:
        The model's answer string, or a short notice if the page has no text.

    Raises:
        KeyError: If ``model_name`` is not one of the known display names.
    """
    # Display name -> Hugging Face repo id.
    model_mapping = {
        "Arabic-mpnet-base-all-nli-triplet": "Omartificial-Intelligence-Space/Arabic-mpnet-base-all-nli-triplet",
        "Arabic-all-nli-triplet-Matryoshka": "Omartificial-Intelligence-Space/Arabic-all-nli-triplet-Matryoshka",
        "Arabert-all-nli-triplet-Matryoshka": "Omartificial-Intelligence-Space/Arabert-all-nli-triplet-Matryoshka",
        "Arabic-labse-Matryoshka": "Omartificial-Intelligence-Space/Arabic-labse-Matryoshka",
        "Marbert-all-nli-triplet-Matryoshka": "Omartificial-Intelligence-Space/Marbert-all-nli-triplet-Matryoshka",
    }

    model_path = model_mapping[model_name]
    model = SentenceTransformer(model_path, trust_remote_code=True, truncate_dim=embed_dim)

    wiki = Wikipedia('RAGBot/0.0', 'ar')
    doc = wiki.page(wiki_page).text
    # Split on blank lines; drop empty/whitespace-only chunks so they are
    # never embedded or retrieved as "context".
    paragraphs = [p for p in doc.split('\n\n') if p.strip()]
    if not paragraphs:
        # Guard: nonexistent or empty page would otherwise crash in encode().
        return "No text found for this Wikipedia page."

    docs_embed = model.encode(paragraphs, normalize_embeddings=True)
    query_embed = model.encode(query, normalize_embeddings=True)
    # Embeddings are L2-normalized, so the dot product IS cosine similarity.
    similarities = np.dot(docs_embed, query_embed.T)
    # Top-k indices, most similar first; tolerate pages with < 3 paragraphs.
    top_k = min(3, len(paragraphs))
    top_idx = np.argsort(similarities, axis=0)[-top_k:][::-1].tolist()
    most_similar_documents = [paragraphs[idx] for idx in top_idx]

    # Wrap each retrieved paragraph to 100 columns for a readable prompt.
    CONTEXT = "".join(
        textwrap.fill(p, width=100) + "\n\n" for p in most_similar_documents
    )

    prompt = f"""
use the following CONTEXT to answer the QUESTION at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
CONTEXT: {CONTEXT}
QUESTION: {query}
"""

    client = OpenAI(api_key=api_key)
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "user", "content": prompt},
        ],
    )

    return response.choices[0].message.content
| |
|
| | |
| | wiki_page_input = gr.Textbox(label="Wikipedia Page (in Arabic)") |
| | query_input = gr.Textbox(label="Query (in Arabic)") |
| | api_key_input = gr.Textbox(label="OpenAI API Key", type="password") |
| |
|
| | model_choice = gr.Dropdown( |
| | choices=[ |
| | "Arabic-mpnet-base-all-nli-triplet", |
| | "Arabic-all-nli-triplet-Matryoshka", |
| | "Arabert-all-nli-triplet-Matryoshka", |
| | "Arabic-labse-Matryoshka", |
| | "Marbert-all-nli-triplet-Matryoshka" |
| | ], |
| | label="Choose Embedding Model" |
| | ) |
| |
|
| | embed_dim_choice = gr.Dropdown( |
| | choices=[768, 512, 256, 128, 64], |
| | label="Embedding Dimension" |
| | ) |
| |
|
| | output_text = gr.Textbox(label="Output") |
| |
|
| | gr.Interface( |
| | fn=process_query, |
| | inputs=[wiki_page_input, model_choice, embed_dim_choice, query_input, api_key_input], |
| | outputs=output_text, |
| | title="Arabic Wiki RAG", |
| | description="Choose a Wikipedia page, embedding model, and dimension to answer a query in Arabic." |
| | ).launch() |