rishabh5752 committed on
Commit
f20b1e9
·
verified ·
1 Parent(s): d2180f0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +99 -109
app.py CHANGED
@@ -1,4 +1,18 @@
1
- import os, pathlib, tempfile, requests, json, textwrap, traceback
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  from functools import lru_cache
3
 
4
  import gradio as gr
@@ -9,150 +23,126 @@ from langchain.docstore.document import Document
9
  from transformers import pipeline
10
  import pypdf
11
 
12
- # ---------------------------------------------------------------------
13
- # 1️⃣  Reference corpus (add/remove as required)
14
- # ---------------------------------------------------------------------
15
  POLICY_URLS = {
16
- # 🇮🇳 India‑specific
17
- "DPDP Act 2023": "https://www.meity.gov.in/static/uploads/2024/06/2bf1f0e9f04e6fb4f8fef35e82c42aa5.pdf",
18
- "Responsible AI (NITI Aayog)": "https://www.niti.gov.in/sites/default/files/2021-08/Part2-Responsible-AI-12082021.pdf",
19
- "National AI Strategy (NITI Aayog)": "https://www.niti.gov.in/sites/default/files/2023-03/National-Strategy-for-Artificial-Intelligence.pdf",
20
- "IS 174281 (Data Privacy Assurance)": "https://archive.org/download/gov.in.is.17428.1.2020/gov.in.is.17428.1.2020.pdf",
21
- "RBI FREEAI Framework 2025": "https://assets.kpmg.com/content/dam/kpmgsites/in/pdf/2025/08/rbi-free-ai-committee-report-on-framework-for-responsible-and-ethical-enablement-of-artificial-intelligence.pdf.coredownload.inline.pdf",
22
-
23
- # 🌐 Global
24
- "OECD AI Principles": "https://oecd.ai/en/assets/files/OECD-LEGAL-0449-en.pdf",
25
- "EU AI Act (Reg. 2024/1689)": "https://eur-lex.europa.eu/resource.html?uri=cellar:99db59ed-3b7b-11ef-9e3c-01aa75ed71a1.0001.02/DOC_1&format=PDF", # consolidated text
26
- "ISO 42001 (AI MS)": "https://standards.iteh.ai/catalog/standards/iso/44d7188c-9cb8-4f0f-a358-06c7ce3e64f9/iso-iec-42001-2023.pdf",
27
- "ISO 23894 (AI Risk Mgmt)": "https://cdn.standards.iteh.ai/samples/77304/cb803ee4e9624430a5db177459158b24/ISO-IEC-23894-2023.pdf",
28
  }
29
 
30
  INDUSTRY_MAP = {
31
- "Finance": ["DPDP Act 2023", "RBI FREEAI Framework 2025", "IS 174281 (Data Privacy Assurance)", "OECD AI Principles"],
32
- "Health Care": ["DPDP Act 2023", "Responsible AI (NITI Aayog)", "ISO 23894 (AI Risk Mgmt)", "OECD AI Principles"],
33
- "ECommerce": ["DPDP Act 2023", "IS 174281 (Data Privacy Assurance)", "OECD AI Principles", "EU AI Act (Reg. 2024/1689)"],
34
  "All": list(POLICY_URLS.keys()),
35
  }
36
 
37
- # ---------------------------------------------------------------------
38
- # 2️⃣  Utility functions
39
- # ---------------------------------------------------------------------
40
 
41
- def download_file(url: str, path: pathlib.Path):
42
- if path.exists():
43
- return path
44
- path.parent.mkdir(parents=True, exist_ok=True)
45
- r = requests.get(url, timeout=60)
46
  r.raise_for_status()
47
- path.write_bytes(r.content)
48
- return path
49
 
50
 
51
- def extract_text_from_pdf(pdf_path: pathlib.Path) -> str:
52
- text = []
53
  with pdf_path.open("rb") as f:
54
  reader = pypdf.PdfReader(f)
55
  for page in reader.pages:
56
- txt = page.extract_text() or ""
57
- text.append(txt)
58
- return "\n".join(text)
59
 
60
 
61
  @lru_cache(maxsize=1)
62
- def build_vector_store(selected_sources=tuple(POLICY_URLS.keys())):
63
- print(" Building vector store …")
64
- documents = []
65
  splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=128)
66
-
67
- for name in selected_sources:
68
- url = POLICY_URLS[name]
69
- pdf_path = pathlib.Path(tempfile.gettempdir()) / "policygpt" / f"{name}.pdf"
70
  try:
71
- download_file(url, pdf_path)
72
- raw_text = extract_text_from_pdf(pdf_path)
73
- chunks = splitter.split_text(raw_text)
74
- for chunk in chunks:
75
- documents.append(Document(page_content=chunk, metadata={"source": name}))
76
- print(f"✔ Loaded {name} ({len(chunks)} chunks)")
77
  except Exception as e:
78
- print(f"✖ Failed to process {name}: {e}")
79
-
80
- embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
81
- store = FAISS.from_documents(documents, embedding=embeddings)
82
- return store
83
-
84
- # Light‑weight generative model for answers
85
- qa_pipeline = pipeline(
86
- "text-generation",
87
- model="google/flan-t5-small",
88
- max_length=256,
89
- do_sample=False,
90
- )
91
-
92
- def rag_answer(question: str, industry: str = "All") -> str:
93
- # Build / get the store
94
- if industry == "All":
95
- store = build_vector_store(tuple(POLICY_URLS.keys()))
96
- else:
97
- store = build_vector_store(tuple(INDUSTRY_MAP[industry]))
98
-
99
- # Retrieve top‑k chunks
100
- docs = store.similarity_search(question, k=4)
101
- context = "\n\n".join([d.page_content for d in docs])
102
-
103
- prompt = textwrap.dedent(f"""\
104
- You are PolicyGPT, an assistant that answers queries about AI governance and data protection
105
- using the CONTEXT below. Provide concise, actionable guidance (≤150 words) and cite the
106
- policy source name in brackets. If the answer is not in context, say "I don’t know."
107
 
108
  CONTEXT:
109
  {context}
110
 
111
- Question: {question}
112
- Answer:
113
  """)
114
-
115
  try:
116
- response = qa_pipeline(prompt, truncate=True)[0]["generated_text"]
117
  except Exception as e:
118
- response = f"Error generating answer: {e}\n{traceback.format_exc()}"
119
- return response.strip()
120
 
121
- # Very naive risk scoring
122
- def compliance_score(answer: str) -> str:
123
- answer_low = answer.lower()
124
- if any(w in answer_low for w in ["prohibited", "penalty", "violation"]):
125
  return "High"
126
- if any(w in answer_low for w in ["must", "should", "shall"]):
127
  return "Medium"
128
  return "Low"
129
 
130
- # ---------------------------------------------------------------------
131
- # 3️⃣  Gradio UI
132
- # ---------------------------------------------------------------------
133
 
134
  def chat(question, industry):
135
- answer = rag_answer(question, industry)
136
- score = compliance_score(answer)
137
- return answer, f"Estimated compliance risk: **{score}**"
138
 
139
  with gr.Blocks(title="PolicyGPT 🇮🇳 (AI & Data Governance)") as demo:
140
- gr.Markdown(
141
- """
142
- # PolicyGPT 🇮🇳
143
- Ask anything about AI & Data Governance policies (DPDP Act, RBI FREE‑AI, ISO 42001, OECD, EU AI Act, etc.).
144
- """)
145
- with gr.Row():
146
- industry_dd = gr.Dropdown(
147
- choices=list(INDUSTRY_MAP.keys()),
148
- label="Select your industry",
149
- value="All",
150
- )
151
- user_input = gr.Textbox(label="Your question")
152
- answer_out = gr.Markdown()
153
- risk_out = gr.Markdown()
154
-
155
- user_input.submit(chat, [user_input, industry_dd], [answer_out, risk_out])
156
 
157
  if __name__ == "__main__":
158
  demo.launch()
 
1
+ # app.py PolicyGPT (Indian Edition) ✅ Bug-fixed
2
+ # --------------------------------------------------
3
+ # Quick start on Spaces:
4
+ # 1. Create a Gradio Space → drop this file as app.py
5
+ # 2. Add requirements.txt:
6
+ # gradio==4.21.0
7
+ # langchain==0.1.14
8
+ # sentence_transformers==2.7.0
9
+ # faiss-cpu==1.7.4
10
+ # pypdf==4.2.0
11
+ # transformers==4.40.2
12
+ # accelerate>=0.25.0
13
+ # 3. Commit → build (<10 min)
14
+
15
+ import pathlib, tempfile, textwrap, traceback, requests
16
  from functools import lru_cache
17
 
18
  import gradio as gr
 
23
  from transformers import pipeline
24
  import pypdf
25
 
26
+ # --------------------------------------------------
27
+ # 1️⃣ Reference corpus
28
+ # --------------------------------------------------
29
  POLICY_URLS = {
30
+ # 🇮🇳 India-centric
31
+ "DPDP Act 2023": "https://www.meity.gov.in/static/uploads/2024/06/2bf1f0e9f04e6fb4f8fef35e82c42aa5.pdf",
32
+ "Responsible AI (NITI Aayog)": "https://www.niti.gov.in/sites/default/files/2021-08/Part2-Responsible-AI-12082021.pdf",
33
+ "National AI Strategy (NITI Aayog)": "https://www.niti.gov.in/sites/default/files/2023-03/National-Strategy-for-Artificial-Intelligence.pdf",
34
+ "IS 17428-1 (Data Privacy Assurance)": "https://archive.org/download/gov.in.is.17428.1.2020/gov.in.is.17428.1.2020.pdf",
35
+ "RBI FREE-AI Framework 2025": "https://assets.kpmg.com/content/dam/kpmgsites/in/pdf/2025/08/rbi-free-ai-committee-report-on-framework-for-responsible-and-ethical-enablement-of-artificial-intelligence.pdf.coredownload.inline.pdf",
36
+
37
+ # 🌐 Global baseline
38
+ "OECD AI Principles": "https://oecd.ai/en/assets/files/OECD-LEGAL-0449-en.pdf",
39
+ "EU AI Act (Reg. 2024/1689)": "https://eur-lex.europa.eu/resource.html?uri=cellar:99db59ed-3b7b-11ef-9e3c-01aa75ed71a1.0001.02/DOC_1&format=PDF",
40
+ "ISO/IEC 42001:2023 (AI MS)": "https://standards.iteh.ai/catalog/standards/iso/44d7188c-9cb8-4f0f-a358-06c7ce3e64f9/iso-iec-42001-2023.pdf",
41
+ "ISO/IEC 23894:2023 (AI Risk Mgmt)": "https://cdn.standards.iteh.ai/samples/77304/cb803ee4e9624430a5db177459158b24/ISO-IEC-23894-2023.pdf",
42
  }
43
 
44
  INDUSTRY_MAP = {
45
+ "Finance": ["DPDP Act 2023", "RBI FREE-AI Framework 2025", "IS 17428-1 (Data Privacy Assurance)", "OECD AI Principles"],
46
+ "Health Care": ["DPDP Act 2023", "Responsible AI (NITI Aayog)", "ISO/IEC 23894:2023 (AI Risk Mgmt)", "OECD AI Principles"],
47
+ "E-Commerce": ["DPDP Act 2023", "IS 17428-1 (Data Privacy Assurance)", "OECD AI Principles", "EU AI Act (Reg. 2024/1689)"],
48
  "All": list(POLICY_URLS.keys()),
49
  }
50
 
51
+ # --------------------------------------------------
52
+ # 2️⃣ Helpers
53
+ # --------------------------------------------------
54
 
55
+ def download_file(url: str, dest: pathlib.Path):
56
+ if dest.exists():
57
+ return dest
58
+ dest.parent.mkdir(parents=True, exist_ok=True)
59
+ r = requests.get(url, timeout=120)
60
  r.raise_for_status()
61
+ dest.write_bytes(r.content)
62
+ return dest
63
 
64
 
65
+ def pdf_to_text(pdf_path: pathlib.Path) -> str:
66
+ out = []
67
  with pdf_path.open("rb") as f:
68
  reader = pypdf.PdfReader(f)
69
  for page in reader.pages:
70
+ out.append(page.extract_text() or "")
71
+ return "\n".join(out)
 
72
 
73
 
74
  @lru_cache(maxsize=1)
75
+ def build_store(sources=tuple(POLICY_URLS.keys())):
76
+ print("🔧 Building FAISS index …")
 
77
  splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=128)
78
+ docs = []
79
+ for name in sources:
80
+ path = pathlib.Path(tempfile.gettempdir()) / "policygpt" / f"{name}.pdf"
 
81
  try:
82
+ download_file(POLICY_URLS[name], path)
83
+ text = pdf_to_text(path)
84
+ for chunk in splitter.split_text(text):
85
+ docs.append(Document(page_content=chunk, metadata={"source": name}))
86
+ print(f" {name}")
 
87
  except Exception as e:
88
+ print(f" {name}: {e}")
89
+ embed = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
90
+ return FAISS.from_documents(docs, embed)
91
+
92
+ # Mini generator (CPU-friendly)
93
+ GEN = pipeline("text-generation", model="google/flan-t5-small", max_new_tokens=200, do_sample=False)
94
+
95
+
96
+ def rag_answer(question: str, industry: str) -> str:
97
+ sel = tuple(POLICY_URLS.keys()) if industry == "All" else tuple(INDUSTRY_MAP[industry])
98
+ store = build_store(sel)
99
+ ctx_docs = store.similarity_search(question, k=4)
100
+ context = "\n\n".join(d.page_content for d in ctx_docs)[:3500] # keep prompt short
101
+
102
+ prompt = textwrap.dedent(f"""
103
+ You are PolicyGPT, an assistant on AI governance & data-protection.
104
+ Use CONTEXT below to answer QUESTION in ≤150 words. Cite source names in brackets.
105
+ If answer is unknown, say: I don’t know.
 
 
 
 
 
 
 
 
 
 
 
106
 
107
  CONTEXT:
108
  {context}
109
 
110
+ QUESTION: {question}
111
+ ANSWER:
112
  """)
 
113
  try:
114
+ return GEN(prompt)[0]["generated_text"].split("ANSWER:")[-1].strip()
115
  except Exception as e:
116
+ return f"⚠️ Generation error: {e}\n{traceback.format_exc()}"
117
+
118
 
119
+ def risk_tag(text: str) -> str:
120
+ t = text.lower()
121
+ if any(k in t for k in ("violation", "prohibited", "penalty")):
 
122
  return "High"
123
+ if any(k in t for k in ("must", "should", "shall", "mandatory")):
124
  return "Medium"
125
  return "Low"
126
 
127
+ # --------------------------------------------------
128
+ # 3️⃣ UI
129
+ # --------------------------------------------------
130
 
131
  def chat(question, industry):
132
+ ans = rag_answer(question, industry)
133
+ return ans, f"**Estimated compliance risk:** {risk_tag(ans)}"
 
134
 
135
  with gr.Blocks(title="PolicyGPT 🇮🇳 (AI & Data Governance)") as demo:
136
+ gr.Markdown("""# PolicyGPT 🇮🇳\nAsk about Indian & global AI governance (DPDP, RBI FREE-AI, ISO 42001, EU AI Act …).""")
137
+
138
+ industry = gr.Dropdown(label="Select industry", choices=list(INDUSTRY_MAP.keys()), value="All")
139
+ qbox = gr.Textbox(lines=2, label="Your question", placeholder="e.g. Key patient-data rules for hospitals?")
140
+ btn = gr.Button("Ask")
141
+ answer = gr.Markdown()
142
+ risk = gr.Markdown()
143
+
144
+ btn.click(chat, [qbox, industry], [answer, risk])
145
+ qbox.submit(chat, [qbox, industry], [answer, risk])
 
 
 
 
 
 
146
 
147
  if __name__ == "__main__":
148
  demo.launch()