rishabh5752 committed on
Commit
d2180f0
·
verified ·
1 Parent(s): a970da9

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +158 -0
app.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, pathlib, tempfile, requests, json, textwrap, traceback
2
+ from functools import lru_cache
3
+
4
+ import gradio as gr
5
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
6
+ from langchain.embeddings import HuggingFaceEmbeddings
7
+ from langchain.vectorstores import FAISS
8
+ from langchain.docstore.document import Document
9
+ from transformers import pipeline
10
+ import pypdf
11
+
12
+ # ---------------------------------------------------------------------
13
+ # 1️⃣  Reference corpus (add/remove as required)
14
+ # ---------------------------------------------------------------------
15
# Display name -> PDF download URL for every document in the reference corpus.
POLICY_URLS = {
    # 🇮🇳 India‑specific
    "DPDP Act 2023": "https://www.meity.gov.in/static/uploads/2024/06/2bf1f0e9f04e6fb4f8fef35e82c42aa5.pdf",
    "Responsible AI (NITI Aayog)": "https://www.niti.gov.in/sites/default/files/2021-08/Part2-Responsible-AI-12082021.pdf",
    "National AI Strategy (NITI Aayog)": "https://www.niti.gov.in/sites/default/files/2023-03/National-Strategy-for-Artificial-Intelligence.pdf",
    "IS 17428‑1 (Data Privacy Assurance)": "https://archive.org/download/gov.in.is.17428.1.2020/gov.in.is.17428.1.2020.pdf",
    "RBI FREE‑AI Framework 2025": "https://assets.kpmg.com/content/dam/kpmgsites/in/pdf/2025/08/rbi-free-ai-committee-report-on-framework-for-responsible-and-ethical-enablement-of-artificial-intelligence.pdf.coredownload.inline.pdf",

    # 🌐 Global
    "OECD AI Principles": "https://oecd.ai/en/assets/files/OECD-LEGAL-0449-en.pdf",
    "EU AI Act (Reg. 2024/1689)": "https://eur-lex.europa.eu/resource.html?uri=cellar:99db59ed-3b7b-11ef-9e3c-01aa75ed71a1.0001.02/DOC_1&format=PDF",  # consolidated text
    "ISO 42001 (AI MS)": "https://standards.iteh.ai/catalog/standards/iso/44d7188c-9cb8-4f0f-a358-06c7ce3e64f9/iso-iec-42001-2023.pdf",
    "ISO 23894 (AI Risk Mgmt)": "https://cdn.standards.iteh.ai/samples/77304/cb803ee4e9624430a5db177459158b24/ISO-IEC-23894-2023.pdf",
}

# Industry preset -> names of the POLICY_URLS entries searched for that preset.
# Every name listed here must be a key of POLICY_URLS.
INDUSTRY_MAP = {
    "Finance": [
        "DPDP Act 2023",
        "RBI FREE‑AI Framework 2025",
        "IS 17428‑1 (Data Privacy Assurance)",
        "OECD AI Principles",
    ],
    "Health Care": [
        "DPDP Act 2023",
        "Responsible AI (NITI Aayog)",
        "ISO 23894 (AI Risk Mgmt)",
        "OECD AI Principles",
    ],
    "E‑Commerce": [
        "DPDP Act 2023",
        "IS 17428‑1 (Data Privacy Assurance)",
        "OECD AI Principles",
        "EU AI Act (Reg. 2024/1689)",
    ],
    # "All" searches the whole corpus, in declaration order.
    "All": list(POLICY_URLS),
}
36
+
37
+ # ---------------------------------------------------------------------
38
+ # 2️⃣  Utility functions
39
+ # ---------------------------------------------------------------------
40
+
41
def download_file(url: str, path: pathlib.Path):
    """Download *url* to *path* unless a cached copy already exists.

    Args:
        url: HTTP(S) location of the file to fetch.
        path: Destination on disk; missing parent directories are created.

    Returns:
        ``path``, whether the file was freshly downloaded or already present.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or the 60 s timeout.
    """
    # Any existing file counts as a cache hit; nothing is re-validated.
    if path.exists():
        return path
    path.parent.mkdir(parents=True, exist_ok=True)
    r = requests.get(url, timeout=60)
    r.raise_for_status()
    # Write to a sibling temp file and rename it into place. Writing directly
    # to `path` could leave a truncated file behind if the process is
    # interrupted, and the exists() check above would then treat that partial
    # file as a complete download on every later call.
    tmp_path = path.with_name(path.name + ".part")
    try:
        tmp_path.write_bytes(r.content)
        tmp_path.replace(path)
    finally:
        # No-op after a successful rename; removes the leftover on failure.
        tmp_path.unlink(missing_ok=True)
    return path
49
+
50
+
51
def extract_text_from_pdf(pdf_path: pathlib.Path) -> str:
    """Return the text of every page of *pdf_path*, joined with newlines.

    A page whose ``extract_text()`` yields nothing contributes an empty
    string, so the result still has one segment per page.
    """
    with pdf_path.open("rb") as handle:
        pages = pypdf.PdfReader(handle).pages
        # Extract while the file is still open; pages are read from `handle`.
        page_texts = [page.extract_text() or "" for page in pages]
    return "\n".join(page_texts)
59
+
60
+
61
# One cache slot per industry preset: with a single slot, every change of
# industry in the UI evicted the previous store and forced a full
# re-parse and re-embedding of the corpus.
@lru_cache(maxsize=len(INDUSTRY_MAP))
def build_vector_store(selected_sources=tuple(POLICY_URLS.keys())):
    """Build a FAISS vector store over the selected policy documents.

    Each source is downloaded (or read from the on-disk cache under the
    system temp directory), converted to text, split into overlapping
    chunks and embedded. Results are memoised per ``selected_sources``.

    Args:
        selected_sources: Tuple of ``POLICY_URLS`` keys to index. It must be
            hashable because it is the ``lru_cache`` key.

    Returns:
        The FAISS store built from every chunk that loaded successfully.

    Raises:
        KeyError: If a name is not a key of ``POLICY_URLS``.
        RuntimeError: If no source yielded any text chunk.
    """
    print("⏳ Building vector store …")
    documents = []
    splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=128)
    cache_dir = pathlib.Path(tempfile.gettempdir()) / "policygpt"

    for name in selected_sources:
        url = POLICY_URLS[name]
        pdf_path = cache_dir / f"{name}.pdf"
        try:
            download_file(url, pdf_path)
            raw_text = extract_text_from_pdf(pdf_path)
            chunks = splitter.split_text(raw_text)
            documents.extend(
                Document(page_content=chunk, metadata={"source": name})
                for chunk in chunks
            )
            print(f"✔ Loaded {name} ({len(chunks)} chunks)")
        except Exception as e:
            # Best effort: one unreachable or unparsable document must not
            # prevent the remaining sources from being indexed.
            print(f"✖ Failed to process {name}: {e}")

    if not documents:
        # Fail with an actionable message instead of an obscure error from
        # FAISS on an empty list. Exceptions are not cached by lru_cache, so
        # the next call retries the downloads.
        raise RuntimeError(
            "No policy documents could be loaded; cannot build the vector store."
        )

    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    store = FAISS.from_documents(documents, embedding=embeddings)
    return store
83
+
84
# Light-weight generative model for answers.
# FLAN-T5 is an encoder-decoder (seq2seq) model, so it has to be loaded with
# the "text2text-generation" task; "text-generation" expects a decoder-only
# causal LM and cannot load a T5 checkpoint.
# NOTE(review): confirm this task name is supported by the pinned
# transformers version.
qa_pipeline = pipeline(
    "text2text-generation",
    model="google/flan-t5-small",
    max_length=256,
    do_sample=False,
)
91
+
92
def rag_answer(question: str, industry: str = "All") -> str:
    """Answer *question* using policy chunks retrieved for *industry*.

    Args:
        question: The user's free-text question.
        industry: A key of ``INDUSTRY_MAP``. Unknown or missing values fall
            back to the full corpus.

    Returns:
        The generated answer with surrounding whitespace removed, a short
        prompt to enter a question when *question* is blank, or an
        ``"Error generating answer: ..."`` message if generation fails.
    """
    if not question or not question.strip():
        return "Please enter a question."

    # "All" is itself an INDUSTRY_MAP key listing every source. An unknown or
    # None selection (e.g. a cleared dropdown) uses the whole corpus instead
    # of raising KeyError.
    sources = INDUSTRY_MAP.get(industry) or POLICY_URLS
    store = build_vector_store(tuple(sources))

    # Retrieve top-k chunks. Each one is prefixed with its source name,
    # because the prompt asks the model to cite the policy source and the
    # chunk text alone does not carry it.
    docs = store.similarity_search(question, k=4)
    context = "\n\n".join(
        f"[{d.metadata.get('source', 'unknown')}] {d.page_content}" for d in docs
    )

    # Assembled line by line: dedenting after interpolating multi-line
    # context would find no common indentation and strip nothing.
    # NOTE(review): the question follows the context, so input truncation
    # may drop it when the context is long - confirm the model's input limit.
    prompt = (
        "You are PolicyGPT, an assistant that answers queries about AI governance and data protection\n"
        "using the CONTEXT below. Provide concise, actionable guidance (≤150 words) and cite the\n"
        'policy source name in brackets. If the answer is not in context, say "I don’t know."\n'
        "\n"
        "CONTEXT:\n"
        f"{context}\n"
        "\n"
        f"Question: {question}\n"
        "Answer:\n"
    )

    try:
        # `truncation` is the pipeline's tokenizer option; the previous
        # `truncate` keyword was forwarded to generate() as an unknown kwarg.
        response = qa_pipeline(prompt, truncation=True)[0]["generated_text"]
    except Exception as e:
        # Keep the stack trace in the server log; do not show it to users.
        traceback.print_exc()
        response = f"Error generating answer: {e}"
    return response.strip()
120
+
121
# Very naive risk scoring
def compliance_score(answer: str) -> str:
    """Classify *answer* as "High", "Medium" or "Low" risk by keyword.

    "High" if the text mentions a prohibition, penalty or violation;
    otherwise "Medium" if it contains obligation wording (must / should /
    shall); otherwise "Low". Matching is case-insensitive and on whole
    words, so e.g. "mustard" or "shallow" do not count as obligations.

    Args:
        answer: The generated answer text.

    Returns:
        One of ``"High"``, ``"Medium"`` or ``"Low"``.
    """
    # Imported here because `re` is not among the file's top-level imports.
    import re

    answer_low = answer.lower()
    # Plural forms are listed explicitly so that whole-word matching still
    # catches "violations" and "penalties".
    if re.search(r"\b(?:prohibited|penalt(?:y|ies)|violations?)\b", answer_low):
        return "High"
    if re.search(r"\b(?:must|should|shall)\b", answer_low):
        return "Medium"
    return "Low"
129
+
130
+ # ---------------------------------------------------------------------
131
+ # 3️⃣  Gradio UI
132
+ # ---------------------------------------------------------------------
133
+
134
def chat(question, industry):
    """Gradio callback: produce the answer and its risk line.

    Returns a pair ``(answer, risk_markdown)``, where ``risk_markdown``
    embeds the ``compliance_score`` of the answer in bold.
    """
    reply = rag_answer(question, industry)
    risk_line = f"Estimated compliance risk: **{compliance_score(reply)}**"
    return reply, risk_line
138
+
139
# Assemble the page. Nothing is served until demo.launch() runs below.
# NOTE(review): the nesting under gr.Row() is reconstructed - confirm which
# components belong inside the row.
with gr.Blocks(title="PolicyGPT 🇮🇳 (AI & Data Governance)") as demo:
    gr.Markdown(
        """
    # PolicyGPT 🇮🇳
    Ask anything about AI & Data Governance policies (DPDP Act, RBI FREE‑AI, ISO 42001, OECD, EU AI Act, etc.).
    """)

    with gr.Row():
        # Offers every industry preset; "All" searches the whole corpus.
        industry_dd = gr.Dropdown(
            label="Select your industry",
            choices=list(INDUSTRY_MAP),
            value="All",
        )

    user_input = gr.Textbox(label="Your question")
    answer_out = gr.Markdown()
    risk_out = gr.Markdown()

    # Pressing Enter in the textbox runs chat() and fills both outputs.
    user_input.submit(
        fn=chat,
        inputs=[user_input, industry_dd],
        outputs=[answer_out, risk_out],
    )

# Start the server only when run as a script, not on import.
if __name__ == "__main__":
    demo.launch()