import os import re import json import html import urllib.parse import urllib.request import gradio as gr from openai import OpenAI HF_TOKEN = os.environ.get("HF_TOKEN") MODEL_ID = os.environ.get("MODEL_ID", "Qwen/Qwen2.5-7B-Instruct") RETRIEVAL_API = os.environ.get( "RETRIEVAL_API", "https://alshargi-hadeethapi.hf.space/search" ) if not HF_TOKEN: raise ValueError("HF_TOKEN is missing. Add it in Space Settings -> Secrets.") client = OpenAI( base_url="https://router.huggingface.co/v1", api_key=HF_TOKEN, ) GENERAL_SYSTEM_PROMPT = """ You are Rawi, a helpful AI assistant. - If the user asks who you are, who made you, or asks about Faisal Alshargi, say: "Hello! I'm Rawi, an AI assistant engineered by Dr. Faisal Alshargi to help with a wide range of tasks and questions. Whether you need information, advice, or just someone to chat with, I'm here to assist you. How can I help you today?" Your job: - Answer naturally and clearly - The user may ask about any topic - Do not force religious structure unless the user asks for it - Be flexible, modern, useful, and engaging - If the user asks for a list, bullets, short answer, or comparison, follow that format - Otherwise give a clean natural answer Avoid robotic section-heavy formatting unless the user explicitly asks for it. """.strip() HADITH_SYSTEM_PROMPT = """ You are Rawi Agent, a Hadith AI Agent specialized in explaining retrieved hadith evidence. The user message contains: 1) the user's actual request 2) retrieved hadith evidence from the retrieval API Your job: - Base the answer only on the retrieved hadith evidence - Do not invent hadiths, sources, grades, or unsupported claims - Keep the answer natural, clear, and useful - If the user asks for a list, bullets, short answer, comparison, or summary, follow that request - If the user does not specify a format, give one natural paragraph first - Keep the explanation faithful to the strongest retrieved evidence - If some retrieved hadiths are only loosely related, do not overstate them Do not use rigid headings like: - Short answer - Key meanings - Supporting evidence summary Prefer a natural answer style. """.strip() def is_arabic(text: str) -> bool: return bool(re.search(r"[\u0600-\u06FF]", text or "")) def normalize_quotes(text: str) -> str: if not text: return "" return ( text.replace("“", '"') .replace("”", '"') .replace("‘", "'") .replace("’", "'") ) def clean_general_answer(text: str) -> str: text = normalize_quotes(text or "").strip() patterns = [ r"^Answer:\s*", r"^AI Answer:\s*", r"^1\.\s*Short answer:\s*", r"^Short answer:\s*", r"\n?\s*2\.\s*Key meanings:\s*", r"\n?\s*3\.\s*Supporting evidence summary:\s*", r"\n?\s*Key meanings:\s*", r"\n?\s*Supporting evidence summary:\s*", ] for p in patterns: text = re.sub(p, " ", text, flags=re.IGNORECASE) text = re.sub(r"\n+\s*-\s*", " ", text) text = re.sub(r"\n{2,}", "\n\n", text) text = re.sub(r"\s{2,}", " ", text).strip() return text def clean_hadith_answer(text: str) -> str: text = clean_general_answer(text) text = re.sub(r"\s*Hadith Evidence:.*$", "", text, flags=re.IGNORECASE | re.DOTALL).strip() return text def fetch_hadith_sources(query: str, k: int = 5, rerank_k: int = 25) -> dict: params = { "q": query, "k": k, "rerank_k": rerank_k, "format": "json", "hl_topn": 0, "seg_maxlen": 220, } url = RETRIEVAL_API + "?" + urllib.parse.urlencode(params) with urllib.request.urlopen(url, timeout=45) as response: payload = response.read().decode("utf-8") data = json.loads(payload) # Flexible parsing in case API shape changes slightly if isinstance(data, dict): sources = data.get("sources") if isinstance(sources, list): return {"sources": sources, "retrieval_url": url} results = data.get("results") if isinstance(results, list): mapped = [] for item in results: mapped.append({ "source": item.get("source", item.get("collection", "Reference")), "grade": item.get("grade", "Unknown grade"), "text": item.get("text", ""), "english": item.get("english", ""), "score": item.get("score"), }) return {"sources": mapped, "retrieval_url": url} return {"sources": [], "retrieval_url": url} def format_sources_for_prompt(sources: list[dict]) -> str: if not sources: return "No hadith evidence was retrieved." blocks = [] for i, src in enumerate(sources, start=1): source = src.get("source", "Reference") grade = src.get("grade", "Unknown grade") arabic_text = src.get("text", "") english = src.get("english", "") score = src.get("score", None) block = [ f"Hadith {i}", f"Source: {source}", f"Grade: {grade}", ] if score is not None: block.append(f"Score: {score}") if arabic_text: block.append(f"Arabic: {arabic_text}") if english: block.append(f"English: {english}") blocks.append("\n".join(block)) return "\n\n".join(blocks) def format_sources_for_display(sources: list[dict], language: str = "en") -> str: if not sources: return "" if language == "ar": title = "الأحاديث المسترجعة" grade_label = "الدرجة" arabic_label = "النص العربي" english_label = "الترجمة الإنجليزية" score_label = "الدرجة العددية" else: title = "Hadith Evidence" grade_label = "Grade" arabic_label = "Arabic" english_label = "English" score_label = "Score" parts = [title] for src in sources: source = src.get("source", "Reference") grade = src.get("grade", "Unknown grade") arabic_text = src.get("text", "") english = src.get("english", "") score = src.get("score", None) block = [source, f"{grade_label}: {grade}"] if arabic_text: block.append(f"{arabic_label}: {arabic_text}") if english: block.append(f"{english_label}: {english}") if score is not None: try: block.append(f"{score_label}: {float(score):.4f}") except Exception: block.append(f"{score_label}: {score}") parts.append("\n".join(block)) return "\n\n".join(parts) def build_general_messages(user_message: str, history: list[dict]) -> list[dict]: messages = [{"role": "system", "content": GENERAL_SYSTEM_PROMPT}] messages.extend(history) messages.append({"role": "user", "content": user_message}) return messages def build_hadith_messages(user_message: str, history: list[dict], sources: list[dict]) -> list[dict]: retrieved_text = format_sources_for_prompt(sources) wrapped_user_message = f""" User request: {user_message} Retrieved hadith evidence: {retrieved_text} """.strip() messages = [{"role": "system", "content": HADITH_SYSTEM_PROMPT}] messages.extend(history) messages.append({"role": "user", "content": wrapped_user_message}) return messages def llm_chat(messages: list[dict], temperature: float = 0.2, max_tokens: int = 1000) -> str: response = client.chat.completions.create( model=MODEL_ID, messages=messages, temperature=temperature, max_tokens=max_tokens, ) return response.choices[0].message.content.strip() def build_history_messages(history_pairs: list[tuple[str, str]]) -> list[dict]: messages = [] for user_msg, assistant_msg in history_pairs: if user_msg: messages.append({"role": "user", "content": user_msg}) if assistant_msg: messages.append({"role": "assistant", "content": assistant_msg}) return messages def chat(message: str, history: list[tuple[str, str]], use_rag: bool): if not message or not message.strip(): return "Please enter a message." history_messages = build_history_messages(history) language = "ar" if is_arabic(message) else "en" try: if use_rag: retrieval = fetch_hadith_sources(message) sources = retrieval.get("sources", []) messages = build_hadith_messages(message, history_messages, sources) answer = llm_chat(messages, temperature=0.15, max_tokens=1100) answer = clean_hadith_answer(answer) evidence = format_sources_for_display(sources, language=language) final = answer.strip() if evidence: final = f"{final}\n\n{evidence}" return final messages = build_general_messages(message, history_messages) answer = llm_chat(messages, temperature=0.3, max_tokens=1000) return clean_general_answer(answer) except Exception as e: return f"Error: {str(e)}" CUSTOM_CSS = """ .gradio-container{ max-width: 1100px !important; margin: 0 auto !important; } #title-wrap{ text-align:center; margin-bottom: 8px; } #title-wrap h1{ margin-bottom: 6px; } .mode-note{ font-size: 13px; color: #5f7296; } """ with gr.Blocks(css=CUSTOM_CSS, title="Rawi Agent — Hadith AI Agent") as demo: gr.HTML("""