Spaces:

rishabh5752
/

Compliance_Chatbot

Sleeping

App Files Files Community

rishabh5752 committed on Sep 12, 2025

Commit

f20b1e9

verified ·

1 Parent(s): d2180f0

Update app.py

Browse files

Files changed (1) hide show

app.py +99 -109

app.py CHANGED Viewed

@@ -1,4 +1,18 @@
-import os, pathlib, tempfile, requests, json, textwrap, traceback
 from functools import lru_cache
 import gradio as gr
@@ -9,150 +23,126 @@ from langchain.docstore.document import Document
 from transformers import pipeline
 import pypdf
-# ---------------------------------------------------------------------
-# 1️⃣  Reference corpus (add/remove as required)
-# ---------------------------------------------------------------------
 POLICY_URLS = {
-    # 🇮🇳 India‑specific
-    "DPDP Act 2023": "https://www.meity.gov.in/static/uploads/2024/06/2bf1f0e9f04e6fb4f8fef35e82c42aa5.pdf",
-    "Responsible AI (NITI Aayog)": "https://www.niti.gov.in/sites/default/files/2021-08/Part2-Responsible-AI-12082021.pdf",
-    "National AI Strategy (NITI Aayog)": "https://www.niti.gov.in/sites/default/files/2023-03/National-Strategy-for-Artificial-Intelligence.pdf",
-    "IS 17428‑1 (Data Privacy Assurance)": "https://archive.org/download/gov.in.is.17428.1.2020/gov.in.is.17428.1.2020.pdf",
-    "RBI FREE‑AI Framework 2025": "https://assets.kpmg.com/content/dam/kpmgsites/in/pdf/2025/08/rbi-free-ai-committee-report-on-framework-for-responsible-and-ethical-enablement-of-artificial-intelligence.pdf.coredownload.inline.pdf",
-    # 🌐 Global
-    "OECD AI Principles": "https://oecd.ai/en/assets/files/OECD-LEGAL-0449-en.pdf",
-    "EU AI Act (Reg. 2024/1689)": "https://eur-lex.europa.eu/resource.html?uri=cellar:99db59ed-3b7b-11ef-9e3c-01aa75ed71a1.0001.02/DOC_1&format=PDF",  # consolidated text
-    "ISO 42001 (AI MS)": "https://standards.iteh.ai/catalog/standards/iso/44d7188c-9cb8-4f0f-a358-06c7ce3e64f9/iso-iec-42001-2023.pdf",
-    "ISO 23894 (AI Risk Mgmt)": "https://cdn.standards.iteh.ai/samples/77304/cb803ee4e9624430a5db177459158b24/ISO-IEC-23894-2023.pdf",
 }
 INDUSTRY_MAP = {
-    "Finance": ["DPDP Act 2023", "RBI FREE‑AI Framework 2025", "IS 17428‑1 (Data Privacy Assurance)", "OECD AI Principles"],
-    "Health Care": ["DPDP Act 2023", "Responsible AI (NITI Aayog)", "ISO 23894 (AI Risk Mgmt)", "OECD AI Principles"],
-    "E‑Commerce": ["DPDP Act 2023", "IS 17428‑1 (Data Privacy Assurance)", "OECD AI Principles", "EU AI Act (Reg. 2024/1689)"],
     "All": list(POLICY_URLS.keys()),
 }
-# ---------------------------------------------------------------------
-# 2️⃣  Utility functions
-# ---------------------------------------------------------------------
-def download_file(url: str, path: pathlib.Path):
-    if path.exists():
-        return path
-    path.parent.mkdir(parents=True, exist_ok=True)
-    r = requests.get(url, timeout=60)
     r.raise_for_status()
-    path.write_bytes(r.content)
-    return path
-def extract_text_from_pdf(pdf_path: pathlib.Path) -> str:
-    text = []
     with pdf_path.open("rb") as f:
         reader = pypdf.PdfReader(f)
         for page in reader.pages:
-            txt = page.extract_text() or ""
-            text.append(txt)
-    return "\n".join(text)
 @lru_cache(maxsize=1)
-def build_vector_store(selected_sources=tuple(POLICY_URLS.keys())):
-    print("⏳ Building vector store …")
-    documents = []
     splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=128)
-    for name in selected_sources:
-        url = POLICY_URLS[name]
-        pdf_path = pathlib.Path(tempfile.gettempdir()) / "policygpt" / f"{name}.pdf"
         try:
-            download_file(url, pdf_path)
-            raw_text = extract_text_from_pdf(pdf_path)
-            chunks = splitter.split_text(raw_text)
-            for chunk in chunks:
-                documents.append(Document(page_content=chunk, metadata={"source": name}))
-            print(f"✔  Loaded {name} ({len(chunks)} chunks)")
         except Exception as e:
-            print(f"✖  Failed to process {name}: {e}")
-    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
-    store = FAISS.from_documents(documents, embedding=embeddings)
-    return store
-# Light‑weight generative model for answers
-qa_pipeline = pipeline(
-    "text-generation",
-    model="google/flan-t5-small",
-    max_length=256,
-    do_sample=False,
-)
-def rag_answer(question: str, industry: str = "All") -> str:
-    # Build / get the store
-    if industry == "All":
-        store = build_vector_store(tuple(POLICY_URLS.keys()))
-    else:
-        store = build_vector_store(tuple(INDUSTRY_MAP[industry]))
-    # Retrieve top‑k chunks
-    docs = store.similarity_search(question, k=4)
-    context = "\n\n".join([d.page_content for d in docs])
-    prompt = textwrap.dedent(f"""\
-        You are PolicyGPT, an assistant that answers queries about AI governance and data protection
-        using the CONTEXT below. Provide concise, actionable guidance (≤150 words) and cite the
-        policy source name in brackets. If the answer is not in context, say "I don’t know."
         CONTEXT:
         {context}
-        Question: {question}
-        Answer:
     """)
     try:
-        response = qa_pipeline(prompt, truncate=True)[0]["generated_text"]
     except Exception as e:
-        response = f"Error generating answer: {e}\n{traceback.format_exc()}"
-    return response.strip()
-# Very naive risk scoring
-def compliance_score(answer: str) -> str:
-    answer_low = answer.lower()
-    if any(w in answer_low for w in ["prohibited", "penalty", "violation"]):
         return "High"
-    if any(w in answer_low for w in ["must", "should", "shall"]):
         return "Medium"
     return "Low"
-# ---------------------------------------------------------------------
-# 3️⃣  Gradio UI
-# ---------------------------------------------------------------------
 def chat(question, industry):
-    answer = rag_answer(question, industry)
-    score = compliance_score(answer)
-    return answer, f"Estimated compliance risk: **{score}**"
 with gr.Blocks(title="PolicyGPT 🇮🇳 (AI & Data Governance)") as demo:
-    gr.Markdown(
-        """
-        # PolicyGPT 🇮🇳
-        Ask anything about AI & Data Governance policies (DPDP Act, RBI FREE‑AI, ISO 42001, OECD, EU AI Act, etc.).
-        """)
-    with gr.Row():
-        industry_dd = gr.Dropdown(
-            choices=list(INDUSTRY_MAP.keys()),
-            label="Select your industry",
-            value="All",
-        )
-        user_input = gr.Textbox(label="Your question")
-    answer_out = gr.Markdown()
-    risk_out = gr.Markdown()
-    user_input.submit(chat, [user_input, industry_dd], [answer_out, risk_out])
 if __name__ == "__main__":
     demo.launch()

+# app.py – PolicyGPT (Indian Edition)  ✅ Bug-fixed
+# --------------------------------------------------
+# Quick start on Spaces:
+# 1. Create a Gradio Space → drop this file as app.py
+# 2. Add requirements.txt:
+#    gradio==4.21.0
+#    langchain==0.1.14
+#    sentence_transformers==2.7.0
+#    faiss-cpu==1.7.4
+#    pypdf==4.2.0
+#    transformers==4.40.2
+#    accelerate>=0.25.0
+# 3. Commit → build (<10 min)
+import pathlib, tempfile, textwrap, traceback, requests
 from functools import lru_cache
 import gradio as gr
 from transformers import pipeline
 import pypdf
+# --------------------------------------------------
+# 1️⃣  Reference corpus
+# --------------------------------------------------
 POLICY_URLS = {
+    # 🇮🇳 India-centric
+    "DPDP Act 2023": "https://www.meity.gov.in/static/uploads/2024/06/2bf1f0e9f04e6fb4f8fef35e82c42aa5.pdf",
+    "Responsible AI (NITI Aayog)": "https://www.niti.gov.in/sites/default/files/2021-08/Part2-Responsible-AI-12082021.pdf",
+    "National AI Strategy (NITI Aayog)": "https://www.niti.gov.in/sites/default/files/2023-03/National-Strategy-for-Artificial-Intelligence.pdf",
+    "IS 17428-1 (Data Privacy Assurance)": "https://archive.org/download/gov.in.is.17428.1.2020/gov.in.is.17428.1.2020.pdf",
+    "RBI FREE-AI Framework 2025": "https://assets.kpmg.com/content/dam/kpmgsites/in/pdf/2025/08/rbi-free-ai-committee-report-on-framework-for-responsible-and-ethical-enablement-of-artificial-intelligence.pdf.coredownload.inline.pdf",
+    # 🌐 Global baseline
+    "OECD AI Principles": "https://oecd.ai/en/assets/files/OECD-LEGAL-0449-en.pdf",
+    "EU AI Act (Reg. 2024/1689)": "https://eur-lex.europa.eu/resource.html?uri=cellar:99db59ed-3b7b-11ef-9e3c-01aa75ed71a1.0001.02/DOC_1&format=PDF",
+    "ISO/IEC 42001:2023 (AI MS)": "https://standards.iteh.ai/catalog/standards/iso/44d7188c-9cb8-4f0f-a358-06c7ce3e64f9/iso-iec-42001-2023.pdf",
+    "ISO/IEC 23894:2023 (AI Risk Mgmt)": "https://cdn.standards.iteh.ai/samples/77304/cb803ee4e9624430a5db177459158b24/ISO-IEC-23894-2023.pdf",
 }
 INDUSTRY_MAP = {
+    "Finance": ["DPDP Act 2023", "RBI FREE-AI Framework 2025", "IS 17428-1 (Data Privacy Assurance)", "OECD AI Principles"],
+    "Health Care": ["DPDP Act 2023", "Responsible AI (NITI Aayog)", "ISO/IEC 23894:2023 (AI Risk Mgmt)", "OECD AI Principles"],
+    "E-Commerce": ["DPDP Act 2023", "IS 17428-1 (Data Privacy Assurance)", "OECD AI Principles", "EU AI Act (Reg. 2024/1689)"],
     "All": list(POLICY_URLS.keys()),
 }
+# --------------------------------------------------
+# 2️⃣  Helpers
+# --------------------------------------------------
def download_file(url: str, dest: pathlib.Path) -> pathlib.Path:
    """Download *url* to *dest* unless a non-empty copy is already cached there.

    Args:
        url: Address of the file to fetch.
        dest: Target path; missing parent directories are created.

    Returns:
        ``dest``, which exists when the function returns normally.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeouts.
    """
    # A zero-byte file is the residue of a failed earlier attempt, not a usable
    # cache entry, so it is fetched again.
    if dest.exists() and dest.stat().st_size > 0:
        return dest
    dest.parent.mkdir(parents=True, exist_ok=True)
    partial = None
    try:
        # Stream to a temporary file in the same directory and rename it into
        # place at the end: an interrupted download can then never leave a
        # truncated file under the final name, where it would be mistaken for
        # a valid cache entry on every later call.
        with requests.get(url, timeout=120, stream=True) as r:
            r.raise_for_status()
            with tempfile.NamedTemporaryFile(dir=dest.parent, suffix=".part", delete=False) as fh:
                partial = pathlib.Path(fh.name)
                for chunk in r.iter_content(chunk_size=65536):
                    fh.write(chunk)
        partial.replace(dest)
    finally:
        # After a successful rename this is a no-op; after a failure it removes
        # the incomplete temporary file.
        if partial is not None:
            partial.unlink(missing_ok=True)
    return dest
def pdf_to_text(pdf_path: pathlib.Path) -> str:
    """Return the text of every page of the PDF at *pdf_path*, joined by newlines.

    Pages for which no text can be extracted contribute an empty string.
    """
    with pdf_path.open("rb") as handle:
        pages = pypdf.PdfReader(handle).pages
        # Extraction happens inside the ``with`` block because the reader pulls
        # page data from the still-open file handle.
        return "\n".join(page.extract_text() or "" for page in pages)
 @lru_cache(maxsize=1)
+def build_store(sources=tuple(POLICY_URLS.keys())):
+    print("🔧 Building FAISS index …")
     splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=128)
+    docs = []
+    for name in sources:
+        path = pathlib.Path(tempfile.gettempdir()) / "policygpt" / f"{name}.pdf"
         try:
+            download_file(POLICY_URLS[name], path)
+            text = pdf_to_text(path)
+            for chunk in splitter.split_text(text):
+                docs.append(Document(page_content=chunk, metadata={"source": name}))
+            print(f"✓ {name}")
         except Exception as e:
+            print(f"✗ {name}: {e}")
+    embed = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+    return FAISS.from_documents(docs, embed)
# Mini generator (CPU-friendly).
# FLAN-T5 is an encoder-decoder (seq2seq) model, so it is served through the
# "text2text-generation" task; "text-generation" is the task for decoder-only
# (causal) language models.
GEN = pipeline("text2text-generation", model="google/flan-t5-small", max_new_tokens=200, do_sample=False)
def rag_answer(question: str, industry: str) -> str:
    """Answer *question* from the policy documents selected for *industry*.

    The four chunks most similar to the question are retrieved from the vector
    store, labelled with their source name, and passed to the generator as
    context.

    Args:
        question: The user's question.
        industry: A key of ``INDUSTRY_MAP``; unknown values use the full corpus.

    Returns:
        The generated answer, or a short message starting with "⚠️" if
        retrieval or generation failed.
    """
    if not question or not question.strip():
        return "Please enter a question."
    # Unknown industries fall back to the full corpus instead of raising KeyError.
    sel = tuple(INDUSTRY_MAP.get(industry) or POLICY_URLS.keys())
    try:
        store = build_store(sel)
        ctx_docs = store.similarity_search(question, k=4)
    except Exception as e:
        # Details go to the server log; the user only sees a short message.
        traceback.print_exc()
        return f"⚠️ Retrieval error: {e}"
    # Label every chunk with its source: the prompt asks the model to cite source
    # names, which it can only do if those names appear in the context.
    context = "\n\n".join(
        f"[{d.metadata.get('source', 'unknown')}]\n{d.page_content}" for d in ctx_docs
    )[:3500]  # keep prompt short
    # Dedent the template BEFORE inserting the context. Retrieved text has
    # unindented lines, so dedenting the already-filled prompt finds no common
    # margin and leaves the template's indentation in place.
    template = textwrap.dedent("""\
        You are PolicyGPT, an assistant on AI governance & data-protection.
        Use CONTEXT below to answer QUESTION in ≤150 words. Cite source names in brackets.
        If answer is unknown, say: I don’t know.
        CONTEXT:
        {context}
        QUESTION: {question}
        ANSWER:
    """)
    prompt = template.format(context=context, question=question)
    try:
        # If the generator echoes the prompt, keep only what follows the last "ANSWER:".
        return GEN(prompt)[0]["generated_text"].split("ANSWER:")[-1].strip()
    except Exception as e:
        # The stack trace is logged rather than returned, so internals are not
        # shown to the user or fed into the keyword-based risk score.
        traceback.print_exc()
        return f"⚠️ Generation error: {e}"
def risk_tag(text: str) -> str:
    """Classify *text* into a coarse compliance-risk label by keyword.

    Args:
        text: The answer text to inspect; matching is case-insensitive.

    Returns:
        "High" if the text mentions a violation, a prohibition or a penalty,
        "Medium" if it contains obligation wording (must, should, shall,
        mandatory), otherwise "Low".
    """
    import re  # local import keeps the function self-contained

    t = text.lower()
    # Whole-word matching: plain substring tests misfire on words such as
    # "shallow" (shall), "shoulder" (should) or "mustard" (must). Plural forms
    # of the high-risk nouns are accepted explicitly.
    if re.search(r"\b(?:violations?|prohibited|penalt(?:y|ies))\b", t):
        return "High"
    if re.search(r"\b(?:must|should|shall|mandatory)\b", t):
        return "Medium"
    return "Low"
+# --------------------------------------------------
+# 3️⃣  UI
+# --------------------------------------------------
 def chat(question, industry):
+    ans = rag_answer(question, industry)
+    return ans, f"**Estimated compliance risk:** {risk_tag(ans)}"
 with gr.Blocks(title="PolicyGPT 🇮🇳 (AI & Data Governance)") as demo:
+    gr.Markdown("""# PolicyGPT 🇮🇳\nAsk about Indian & global AI governance (DPDP, RBI FREE-AI, ISO 42001, EU AI Act …).""")
+    industry = gr.Dropdown(label="Select industry", choices=list(INDUSTRY_MAP.keys()), value="All")
+    qbox = gr.Textbox(lines=2, label="Your question", placeholder="e.g. Key patient-data rules for hospitals?")
+    btn = gr.Button("Ask")
+    answer = gr.Markdown()
+    risk = gr.Markdown()
+    btn.click(chat, [qbox, industry], [answer, risk])
+    qbox.submit(chat, [qbox, industry], [answer, risk])
 if __name__ == "__main__":
     demo.launch()