# Spaces: Alshargi / Hadithi
# Space status: Build error
# File size: 11,089 bytes

import os
import re
import json
import html
import urllib.parse
import urllib.request
import gradio as gr
from openai import OpenAI

# Hugging Face access token; used as the API key for the router client below.
HF_TOKEN = os.environ.get("HF_TOKEN")
# Chat model identifier; overridable through the MODEL_ID environment variable.
MODEL_ID = os.environ.get("MODEL_ID", "Qwen/Qwen2.5-7B-Instruct")
# Search endpoint queried by fetch_hadith_sources; overridable via RETRIEVAL_API.
RETRIEVAL_API = os.environ.get(
    "RETRIEVAL_API",
    "https://alshargi-hadeethapi.hf.space/search"
)

# Fail at import time when no token is configured (unset or empty).
if not HF_TOKEN:
    raise ValueError("HF_TOKEN is missing. Add it in Space Settings -> Secrets.")

# OpenAI client pointed at the Hugging Face router endpoint instead of the
# library's default base URL.
client = OpenAI(
    base_url="https://router.huggingface.co/v1",
    api_key=HF_TOKEN,
)

# System prompt for plain chat mode; build_general_messages sends it as the
# first message. Surrounding blank lines are removed by the trailing .strip().
GENERAL_SYSTEM_PROMPT = """
You are Rawi, a helpful AI assistant.

- If the user asks who you are, who made you, or asks about Faisal Alshargi, say:
  "Hello! I'm Rawi, an AI assistant engineered by Dr. Faisal Alshargi to help with a wide range of tasks and questions. Whether you need information, advice, or just someone to chat with, I'm here to assist you. How can I help you today?"
  
Your job:
- Answer naturally and clearly
- The user may ask about any topic
- Do not force religious structure unless the user asks for it
- Be flexible, modern, useful, and engaging
- If the user asks for a list, bullets, short answer, or comparison, follow that format
- Otherwise give a clean natural answer

Avoid robotic section-heavy formatting unless the user explicitly asks for it.
""".strip()

# System prompt for hadith-evidence mode; build_hadith_messages sends it as the
# first message, followed by a user turn that embeds the retrieved evidence.
HADITH_SYSTEM_PROMPT = """
You are Rawi Agent, a Hadith AI Agent specialized in explaining retrieved hadith evidence.

The user message contains:
1) the user's actual request
2) retrieved hadith evidence from the retrieval API

Your job:
- Base the answer only on the retrieved hadith evidence
- Do not invent hadiths, sources, grades, or unsupported claims
- Keep the answer natural, clear, and useful
- If the user asks for a list, bullets, short answer, comparison, or summary, follow that request
- If the user does not specify a format, give one natural paragraph first
- Keep the explanation faithful to the strongest retrieved evidence
- If some retrieved hadiths are only loosely related, do not overstate them

Do not use rigid headings like:
- Short answer
- Key meanings
- Supporting evidence summary

Prefer a natural answer style.
""".strip()


def is_arabic(text: str) -> bool:
    """Report whether *text* contains a character in the range U+0600..U+06FF.

    ``None`` and the empty string are reported as not Arabic.
    """
    if not text:
        return False
    return re.search(r"[\u0600-\u06FF]", text) is not None


def normalize_quotes(text: str) -> str:
    """Replace curly quotation marks in *text* with their ASCII counterparts.

    Curly double quotes become ``"`` and curly single quotes become ``'``.
    A falsy *text* (``None`` or ``""``) yields the empty string.
    """
    if not text:
        return ""

    straightened = text
    for curly, plain in (("“", '"'), ("”", '"'), ("‘", "'"), ("’", "'")):
        straightened = straightened.replace(curly, plain)
    return straightened


def clean_general_answer(text: str) -> str:
    """Strip boilerplate labels from a model reply and tidy its whitespace.

    The reply is first passed through ``normalize_quotes``. Leading
    ``Answer:`` / ``AI Answer:`` / ``Short answer:`` labels and the
    ``Key meanings:`` / ``Supporting evidence summary:`` headings (numbered
    or not, case-insensitive) are replaced by a single space, and dash
    bullets are folded into the running text.

    Whitespace cleanup keeps the reply's line structure: trailing whitespace
    is removed from each line, runs of blank lines shrink to one blank line,
    and runs of spaces/tabs inside a line shrink to one space.

    Args:
        text: Raw model output; ``None`` is treated as an empty string.

    Returns:
        The cleaned reply, stripped of leading and trailing whitespace.
    """
    text = normalize_quotes(text or "").strip()

    patterns = [
        r"^Answer:\s*",
        r"^AI Answer:\s*",
        r"^1\.\s*Short answer:\s*",
        r"^Short answer:\s*",
        r"\n?\s*2\.\s*Key meanings:\s*",
        r"\n?\s*3\.\s*Supporting evidence summary:\s*",
        r"\n?\s*Key meanings:\s*",
        r"\n?\s*Supporting evidence summary:\s*",
    ]
    for p in patterns:
        text = re.sub(p, " ", text, flags=re.IGNORECASE)

    # Fold "- item" bullet lines into the preceding sentence.
    text = re.sub(r"\n+\s*-\s*", " ", text)

    # Drop whitespace that trails a line (this also empties whitespace-only
    # lines) so the blank-line rule below sees bare newlines.
    text = re.sub(r"[^\S\n]+\n", "\n", text)
    text = re.sub(r"\n{3,}", "\n\n", text)

    # Collapse only horizontal whitespace that follows visible text. Using
    # \s here would also match "\n\n" and erase every paragraph break that
    # the previous step just normalised.
    text = re.sub(r"(?<=\S)[^\S\n]{2,}", " ", text).strip()
    return text


def clean_hadith_answer(text: str) -> str:
    """Clean a hadith-mode reply and cut off any echoed evidence section.

    The reply goes through ``clean_general_answer``; everything from the
    first ``Hadith Evidence:`` marker (case-insensitive) to the end of the
    text, plus the whitespace just before it, is then removed.
    """
    cleaned = clean_general_answer(text)
    without_evidence = re.sub(
        r"\s*Hadith Evidence:.*$",
        "",
        cleaned,
        flags=re.IGNORECASE | re.DOTALL,
    )
    return without_evidence.strip()


def fetch_hadith_sources(query: str, k: int = 5, rerank_k: int = 25) -> dict:
    """Query the retrieval API and return the hadith sources it reports.

    Args:
        query: Search text, sent as the ``q`` query parameter.
        k: Sent as the ``k`` query parameter (presumably the number of
            results to return; confirm against the API).
        rerank_k: Sent as the ``rerank_k`` query parameter (presumably the
            candidate pool size for reranking; confirm against the API).

    Returns:
        A dict with two keys: ``"sources"``, a list of source dicts (empty
        when the payload has no recognised shape), and ``"retrieval_url"``,
        the full URL that was requested.

    Raises:
        urllib.error.URLError: If the request fails or times out.
        json.JSONDecodeError: If the response body is not valid JSON.
    """
    params = {
        "q": query,
        "k": k,
        "rerank_k": rerank_k,
        "format": "json",
        "hl_topn": 0,
        "seg_maxlen": 220,
    }
    url = RETRIEVAL_API + "?" + urllib.parse.urlencode(params)

    with urllib.request.urlopen(url, timeout=45) as response:
        payload = response.read().decode("utf-8")

    return {
        "sources": _extract_sources(json.loads(payload)),
        "retrieval_url": url,
    }


def _extract_sources(data) -> list[dict]:
    """Pull the list of source dicts out of a decoded retrieval payload."""
    if not isinstance(data, dict):
        return []

    # Preferred shape: {"sources": [...]} is passed through as-is, except that
    # non-dict entries are skipped because every consumer calls .get() on them.
    sources = data.get("sources")
    if isinstance(sources, list):
        return [item for item in sources if isinstance(item, dict)]

    # Alternative shape: {"results": [...]} is mapped onto the same keys.
    results = data.get("results")
    if isinstance(results, list):
        return [
            {
                "source": item.get("source", item.get("collection", "Reference")),
                "grade": item.get("grade", "Unknown grade"),
                "text": item.get("text", ""),
                "english": item.get("english", ""),
                "score": item.get("score"),
            }
            for item in results
            if isinstance(item, dict)
        ]

    return []


def format_sources_for_prompt(sources: list[dict]) -> str:
    """Render retrieved sources as plain text for inclusion in the LLM prompt.

    Each source becomes a block of ``Label: value`` lines headed by
    ``Hadith <n>`` (numbered from 1); blocks are separated by a blank line.
    The score line is omitted when the score is ``None``, and the Arabic and
    English lines are omitted when their values are empty.

    Returns a fixed "nothing retrieved" sentence when *sources* is empty.
    """
    if not sources:
        return "No hadith evidence was retrieved."

    def render(number: int, record: dict) -> str:
        lines = [
            f"Hadith {number}",
            f"Source: {record.get('source', 'Reference')}",
            f"Grade: {record.get('grade', 'Unknown grade')}",
        ]
        score = record.get("score", None)
        if score is not None:
            lines.append(f"Score: {score}")
        arabic_text = record.get("text", "")
        if arabic_text:
            lines.append(f"Arabic: {arabic_text}")
        english = record.get("english", "")
        if english:
            lines.append(f"English: {english}")
        return "\n".join(lines)

    return "\n\n".join(
        render(number, record) for number, record in enumerate(sources, start=1)
    )


def format_sources_for_display(sources: list[dict], language: str = "en") -> str:
    """Render retrieved sources as the evidence section shown to the user.

    The output starts with a title line, followed by one block per source;
    all parts are separated by a blank line. Labels are Arabic when
    *language* is ``"ar"`` and English for any other value. Scores are shown
    with four decimals when they convert to ``float``, otherwise verbatim.

    Returns the empty string when *sources* is empty.
    """
    if not sources:
        return ""

    use_arabic = language == "ar"
    title = "الأحاديث المسترجعة" if use_arabic else "Hadith Evidence"
    grade_label = "الدرجة" if use_arabic else "Grade"
    arabic_label = "النص العربي" if use_arabic else "Arabic"
    english_label = "الترجمة الإنجليزية" if use_arabic else "English"
    score_label = "الدرجة العددية" if use_arabic else "Score"

    sections = [title]
    for record in sources:
        lines = [
            record.get("source", "Reference"),
            f"{grade_label}: {record.get('grade', 'Unknown grade')}",
        ]

        arabic_text = record.get("text", "")
        if arabic_text:
            lines.append(f"{arabic_label}: {arabic_text}")

        english = record.get("english", "")
        if english:
            lines.append(f"{english_label}: {english}")

        score = record.get("score", None)
        if score is not None:
            # Fall back to the raw value when the score is not numeric.
            try:
                shown = f"{float(score):.4f}"
            except Exception:
                shown = f"{score}"
            lines.append(f"{score_label}: {shown}")

        sections.append("\n".join(lines))

    return "\n\n".join(sections)


def build_general_messages(user_message: str, history: list[dict]) -> list[dict]:
    """Assemble the message list for a plain chat request.

    The result is the general system prompt, then the prior turns in
    *history*, then *user_message* as the final user turn.
    """
    system_turn = {"role": "system", "content": GENERAL_SYSTEM_PROMPT}
    user_turn = {"role": "user", "content": user_message}
    return [system_turn, *history, user_turn]


def build_hadith_messages(user_message: str, history: list[dict], sources: list[dict]) -> list[dict]:
    """Assemble the message list for a hadith-evidence request.

    The final user turn wraps *user_message* together with the retrieved
    evidence, rendered by ``format_sources_for_prompt``. It is preceded by
    the hadith system prompt and the prior turns in *history*.
    """
    retrieved_text = format_sources_for_prompt(sources)
    wrapped_user_message = (
        f"User request:\n{user_message}\n\n"
        f"Retrieved hadith evidence:\n{retrieved_text}"
    ).strip()

    return [
        {"role": "system", "content": HADITH_SYSTEM_PROMPT},
        *history,
        {"role": "user", "content": wrapped_user_message},
    ]


def llm_chat(messages: list[dict], temperature: float = 0.2, max_tokens: int = 1000) -> str:
    """Send *messages* to the configured chat model and return its reply text.

    Args:
        messages: Chat messages as ``{"role": ..., "content": ...}`` dicts.
        temperature: Sampling temperature forwarded to the API.
        max_tokens: Completion token limit forwarded to the API.

    Returns:
        The first choice's message content with surrounding whitespace
        removed, or ``""`` when the model returned no text content.

    Raises:
        RuntimeError: If the response contains no completion choices.
    """
    response = client.chat.completions.create(
        model=MODEL_ID,
        messages=messages,
        temperature=temperature,
        max_tokens=max_tokens,
    )

    choices = response.choices or []
    if not choices:
        raise RuntimeError("The model returned no completion choices.")

    # The API may return a message whose content is None; treat it as an
    # empty reply instead of failing on None.strip().
    content = choices[0].message.content
    return (content or "").strip()


def build_history_messages(history_pairs: list[tuple[str, str]]) -> list[dict]:
    """Flatten ``(user, assistant)`` pairs into a list of role/content dicts.

    Turns are emitted in order, user before assistant within each pair.
    Empty or ``None`` entries are skipped.
    """
    return [
        {"role": role, "content": content}
        for user_msg, assistant_msg in history_pairs
        for role, content in (("user", user_msg), ("assistant", assistant_msg))
        if content
    ]


def chat(message: str, history: list[tuple[str, str]], use_rag: bool):
    """Produce the assistant reply for one user message.

    With *use_rag* false the message is answered in plain chat mode. With
    *use_rag* true, hadith sources are fetched for the message, the model
    answers from them, and the formatted evidence is appended to the answer
    (labelled in Arabic when the message contains Arabic characters).

    A blank message yields a prompt to enter one. Any exception raised while
    retrieving or generating is returned as an ``"Error: ..."`` string
    rather than propagated.
    """
    if not (message and message.strip()):
        return "Please enter a message."

    history_messages = build_history_messages(history)
    language = "ar" if is_arabic(message) else "en"

    try:
        if not use_rag:
            prompt = build_general_messages(message, history_messages)
            reply = llm_chat(prompt, temperature=0.3, max_tokens=1000)
            return clean_general_answer(reply)

        sources = fetch_hadith_sources(message).get("sources", [])

        prompt = build_hadith_messages(message, history_messages, sources)
        reply = llm_chat(prompt, temperature=0.15, max_tokens=1100)
        final = clean_hadith_answer(reply).strip()

        evidence = format_sources_for_display(sources, language=language)
        return f"{final}\n\n{evidence}" if evidence else final

    except Exception as e:
        return f"Error: {str(e)}"


# Stylesheet passed to gr.Blocks: limits and centres the page container and
# styles the title block (#title-wrap) and the small mode hint (.mode-note).
CUSTOM_CSS = """
.gradio-container{
  max-width: 1100px !important;
  margin: 0 auto !important;
}
#title-wrap{
  text-align:center;
  margin-bottom: 8px;
}
#title-wrap h1{
  margin-bottom: 6px;
}
.mode-note{
  font-size: 13px;
  color: #5f7296;
}
"""

# Build the Gradio UI: title, chat window, input row, and event wiring.
with gr.Blocks(css=CUSTOM_CSS, title="Rawi Agent — Hadith AI Agent") as demo:
    # Static page header; the ids/classes are the ones styled in CUSTOM_CSS.
    gr.HTML("""
    <div id="title-wrap">
      <h1>Rawi Agent</h1>
      <div class="mode-note">General chat by default. Enable hadith evidence when you want retrieved hadith support.</div>
    </div>
    """)

    # NOTE(review): bubble_full_width and (user, assistant) tuple history are
    # version-sensitive Gradio features; confirm against the pinned release.
    chatbot = gr.Chatbot(
        label="Rawi",
        height=600,
        bubble_full_width=False,
    )

    with gr.Row():
        with gr.Column(scale=8):
            msg = gr.Textbox(
                placeholder="Ask about anything...",
                lines=3,
                max_lines=8,
                show_label=False,
            )
        with gr.Column(scale=2, min_width=180):
            # Toggles between plain chat and hadith-evidence mode in chat().
            use_rag = gr.Checkbox(
                label="Include Hadith Evidence",
                value=False,
            )
            send = gr.Button("Send", variant="primary")
            clear = gr.Button("Clear")

    # Conversation history as a list of (user, assistant) tuples; the same
    # list is written to both this state and the chatbot display.
    state = gr.State([])

    def submit_message(user_message, chat_history, rag_enabled):
        """Answer one message and return the updated UI values.

        Returns an empty string (clearing the textbox) and the extended
        history twice, once for the chatbot and once for the state. A new
        list is built, so the incoming history is not mutated. A blank
        message is still appended, paired with chat()'s reply to it.
        """
        response = chat(user_message, chat_history, rag_enabled)
        chat_history = chat_history + [(user_message, response)]
        return "", chat_history, chat_history

    # The Send button and the textbox submit event share the same handler.
    send.click(
        submit_message,
        inputs=[msg, state, use_rag],
        outputs=[msg, chatbot, state],
    )

    msg.submit(
        submit_message,
        inputs=[msg, state, use_rag],
        outputs=[msg, chatbot, state],
    )

    # Clear resets both the visible chat and the stored history.
    clear.click(
        lambda: ([], []),
        outputs=[chatbot, state],
    )

# Listen on all interfaces on port 7860 when run as a script.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)