Install langchain-community using pip

#2
by Ready2make - opened
Nestle_HR_Assistant_LangChain_Shareable.ipynb ADDED
@@ -0,0 +1,274 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "8980ecb3",
6
+ "metadata": {},
7
+ "source": [
8
+ "\n",
9
+ "# Nestlé HR Assistant (LangChain, Modern) — Shareable Gradio URL\n",
10
+ "\n",
11
+ "This notebook sets up a Retrieval-Augmented Generation (RAG) chatbot over your Nestlé HR policy PDF.\n",
12
+ "\n",
13
+ "**What you get**\n",
14
+ "- `%pip install` with correct modules (fixes `ModuleNotFoundError: langchain_community...`).\n",
15
+ "- PDF loading & chunking with `PyPDFLoader` + `RecursiveCharacterTextSplitter`.\n",
16
+ "- Embeddings + FAISS vector store using OpenAI (or change model as needed).\n",
17
+ "- `ChatOpenAI` + `RetrievalQA` chain that answers **only** from the policy.\n",
18
+ "- Gradio UI with `share=True` → **Public URL** for screenshots.\n",
19
+ "\n",
20
+ "> Run cells in order. Last cell prints the Public URL.\n"
21
+ ]
22
+ },
23
+ {
24
+ "cell_type": "code",
25
+ "execution_count": null,
26
+ "id": "73dc5e11",
27
+ "metadata": {},
28
+ "outputs": [],
29
+ "source": [
30
+ "\n",
31
+ "# Install or upgrade required libraries\n",
32
+ "%pip -q install -U langchain langchain-openai langchain-community faiss-cpu pypdf gradio tiktoken\n"
33
+ ]
34
+ },
35
+ {
36
+ "cell_type": "code",
37
+ "execution_count": null,
38
+ "id": "f1008770",
39
+ "metadata": {},
40
+ "outputs": [],
41
+ "source": [
42
+ "\n",
43
+ "import os, glob, shutil, sys\n",
44
+ "\n",
45
+ "import langchain\n",
46
+ "import langchain_openai\n",
47
+ "import langchain_community\n",
48
+ "import gradio as gr\n",
49
+ "\n",
50
+ "from langchain_community.document_loaders import PyPDFLoader\n",
51
+ "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
52
+ "from langchain_openai import OpenAIEmbeddings, ChatOpenAI\n",
53
+ "from langchain_community.vectorstores import FAISS\n",
54
+ "from langchain.prompts import PromptTemplate\n",
55
+ "from langchain.chains import RetrievalQA\n",
56
+ "\n",
57
+ "print(\"Python:\", sys.version)\n",
58
+ "print(\"langchain:\", langchain.__version__)\n",
59
+ "print(\"langchain-openai:\", getattr(langchain_openai, \"__version__\", \"n/a\"))\n",
60
+ "print(\"langchain-community:\", getattr(langchain_community, \"__version__\", \"n/a\"))\n",
61
+ "print(\"gradio:\", gr.__version__)\n"
62
+ ]
63
+ },
64
+ {
65
+ "cell_type": "code",
66
+ "execution_count": null,
67
+ "id": "e22527ac",
68
+ "metadata": {},
69
+ "outputs": [],
70
+ "source": [
71
+ "\n",
72
+ "# --- Configuration ---\n",
73
+ "# IMPORTANT: Set your OpenAI API key here or in the environment before running.\n",
74
+ "# os.environ[\"OPENAI_API_KEY\"] = \"sk-...\" # <-- uncomment and paste if needed\n",
75
+ "\n",
76
+ "MODEL_NAME = os.getenv(\"OPENAI_MODEL\", \"gpt-4o-mini\") # you can change to \"gpt-3.5-turbo\"\n",
77
+ "if not os.getenv(\"OPENAI_API_KEY\"):\n",
78
+ " raise SystemExit(\"Missing OPENAI_API_KEY. Set it above or in your environment and re-run.\")\n",
79
+ "\n",
80
+ "RETRIEVE_K = 5\n",
81
+ "CHUNK_SIZE = 900\n",
82
+ "CHUNK_OVERLAP = 150\n"
83
+ ]
84
+ },
85
+ {
86
+ "cell_type": "code",
87
+ "execution_count": null,
88
+ "id": "5d82453a",
89
+ "metadata": {},
90
+ "outputs": [],
91
+ "source": [
92
+ "\n",
93
+ "# Find your Nestlé HR policy PDF; prefers your uploaded filename.\n",
94
+ "candidates = [\n",
95
+ " \"1728286846_the_nestle_hr_policy_pdf_2012 (1).pdf\",\n",
96
+ " \"the_nestle_hr_policy_pdf_2012.pdf\",\n",
97
+ "]\n",
98
+ "\n",
99
+ "PDF_PATH = next((p for p in candidates if os.path.exists(p)), None)\n",
100
+ "\n",
101
+ "# If running where uploads live under /mnt/data, copy it locally\n",
102
+ "if PDF_PATH is None:\n",
103
+ " mnt = \"/mnt/data/1728286846_the_nestle_hr_policy_pdf_2012 (1).pdf\"\n",
104
+ " if os.path.exists(mnt):\n",
105
+ " try:\n",
106
+ " shutil.copy(mnt, os.path.basename(mnt))\n",
107
+ " PDF_PATH = os.path.basename(mnt)\n",
108
+ " except Exception:\n",
109
+ " PDF_PATH = mnt\n",
110
+ "\n",
111
+ "# Fallback: glob\n",
112
+ "if PDF_PATH is None:\n",
113
+ " hits = glob.glob(\"*nestle*hr*policy*.pdf\") + glob.glob(\"*HR*Policy*.pdf\")\n",
114
+ " PDF_PATH = hits[0] if hits else None\n",
115
+ "\n",
116
+ "if PDF_PATH is None or not os.path.exists(PDF_PATH):\n",
117
+ " raise SystemExit(\"Policy PDF not found. Put it next to this notebook or under /mnt/data/\")\n",
118
+ "else:\n",
119
+ " print(\"Using PDF:\", PDF_PATH)\n"
120
+ ]
121
+ },
122
+ {
123
+ "cell_type": "code",
124
+ "execution_count": null,
125
+ "id": "2d018f1b",
126
+ "metadata": {},
127
+ "outputs": [],
128
+ "source": [
129
+ "\n",
130
+ "loader = PyPDFLoader(PDF_PATH)\n",
131
+ "pages = loader.load()\n",
132
+ "\n",
133
+ "splitter = RecursiveCharacterTextSplitter(\n",
134
+ " chunk_size=CHUNK_SIZE,\n",
135
+ " chunk_overlap=CHUNK_OVERLAP,\n",
136
+ " separators=[\"\\n\\n\", \"\\n\", \" \", \"\"],\n",
137
+ ")\n",
138
+ "docs = splitter.split_documents(pages)\n",
139
+ "print(f\"Loaded pages: {len(pages)} | Chunks: {len(docs)}\")\n"
140
+ ]
141
+ },
142
+ {
143
+ "cell_type": "code",
144
+ "execution_count": null,
145
+ "id": "f93d6b4c",
146
+ "metadata": {},
147
+ "outputs": [],
148
+ "source": [
149
+ "\n",
150
+ "emb = OpenAIEmbeddings()\n",
151
+ "vs = FAISS.from_documents(docs, emb)\n",
152
+ "retriever = vs.as_retriever(search_kwargs={\"k\": RETRIEVE_K})\n",
153
+ "print(\"Vector store ready (FAISS).\")\n"
154
+ ]
155
+ },
156
+ {
157
+ "cell_type": "code",
158
+ "execution_count": null,
159
+ "id": "9dd28a74",
160
+ "metadata": {},
161
+ "outputs": [],
162
+ "source": [
163
+ "\n",
164
+ "system_rules = (\n",
165
+ " \"You are an assistant answering questions about the Nestlé HR Policy. \"\n",
166
+ " \"Use ONLY the provided context. If the answer is not present, say: \"\n",
167
+ " \"'I don’t know based on the provided policy.' Be concise and factual.\"\n",
168
+ ")\n",
169
+ "\n",
170
+ "prompt = PromptTemplate(\n",
171
+ " input_variables=[\"context\", \"question\"],\n",
172
+ " template=(\n",
173
+ " \"{rules}\\n\\n\"\n",
174
+ " \"Context:\\n{context}\\n\\n\"\n",
175
+ " \"Question: {question}\\n\\n\"\n",
176
+ " \"Answer:\"\n",
177
+ " ).format(rules=system_rules, context=\"{context}\", question=\"{question}\")\n",
178
+ ")\n",
179
+ "\n",
180
+ "llm = ChatOpenAI(model=MODEL_NAME, temperature=0)\n",
181
+ "qa = RetrievalQA.from_chain_type(\n",
182
+ " llm=llm,\n",
183
+ " chain_type=\"stuff\",\n",
184
+ " retriever=retriever,\n",
185
+ " chain_type_kwargs={\"prompt\": prompt, \"document_variable_name\": \"context\"},\n",
186
+ " return_source_documents=True,\n",
187
+ ")\n",
188
+ "\n",
189
+ "def format_sources(source_documents):\n",
190
+ " pages = []\n",
191
+ " for d in source_documents or []:\n",
192
+ " p = d.metadata.get(\"page\", None)\n",
193
+ " if isinstance(p, int):\n",
194
+ " pages.append(p + 1) # 1-based\n",
195
+ " if not pages:\n",
196
+ " return \"\"\n",
197
+ " uniq = sorted(set(pages))\n",
198
+ " return \"**Source:** Nestlé Human Resources Policy (pp. \" + \", \".join(map(str, uniq)) + \").\"\n",
199
+ "\n",
200
+ "def snippets(source_documents, max_chars=280):\n",
201
+ " out = []\n",
202
+ " for d in (source_documents or [])[:3]:\n",
203
+ " txt = (d.page_content or \"\").strip().replace(\"\\n\", \" \")\n",
204
+ " out.append(\"• \" + (txt[:max_chars] + (\"…\" if len(txt) > max_chars else \"\")))\n",
205
+ " return \"\\n\".join(out)\n",
206
+ "\n",
207
+ "def ask(query: str):\n",
208
+ " if not query.strip():\n",
209
+ " return \"Please enter a question.\", \"\"\n",
210
+ " res = qa({\"query\": query})\n",
211
+ " ans = (res.get(\"result\") or \"\").strip() or \"I don’t know based on the provided policy.\"\n",
212
+ " srcs = res.get(\"source_documents\") or []\n",
213
+ " return ans + (\"\\n\\n\" + format_sources(srcs) if srcs else \"\"), (\"**Top snippets:**\\n\" + snippets(srcs)) if srcs else \"\"\n"
214
+ ]
215
+ },
216
+ {
217
+ "cell_type": "code",
218
+ "execution_count": null,
219
+ "id": "4eaaa3c8",
220
+ "metadata": {},
221
+ "outputs": [],
222
+ "source": [
223
+ "\n",
224
+ "# Quick smoke test\n",
225
+ "test_q = \"What does the policy say about training and learning?\"\n",
226
+ "a, s = ask(test_q)\n",
227
+ "print(\"Q:\", test_q)\n",
228
+ "print(\"A:\", a[:500], \"...\" if len(a) > 500 else \"\")\n",
229
+ "print(s)\n"
230
+ ]
231
+ },
232
+ {
233
+ "cell_type": "code",
234
+ "execution_count": null,
235
+ "id": "7cf1a435",
236
+ "metadata": {},
237
+ "outputs": [],
238
+ "source": [
239
+ "\n",
240
+ "# Gradio UI with share=True to get a Public URL for screenshots\n",
241
+ "dark_css = (\n",
242
+ " \".gradio-container {max-width: 900px !important}\\n\"\n",
243
+ " \"body {background: #0b0f14 !important}\\n\"\n",
244
+ " \".prose, .gr-markdown, label, .gr-button {color: #e6edf3 !important}\\n\"\n",
245
+ " \"textarea, input, .gr-box {background:#0f1620 !important; color:#e6edf3 !important}\\n\"\n",
246
+ ")\n",
247
+ "\n",
248
+ "with gr.Blocks(title=\"Nestlé HR Policy Assistant\", css=dark_css, theme=gr.themes.Soft()) as demo:\n",
249
+ " gr.Markdown(\"# Nestlé HR Policy Assistant (LangChain, Modern)\")\n",
250
+ " gr.Markdown(\"Ask questions grounded in the HR Policy PDF. Answers include page citations.\")\n",
251
+ " q = gr.Textbox(label=\"Your question\", placeholder=\"Type your question and press Enter\")\n",
252
+ " ask_btn = gr.Button(\"Ask\")\n",
253
+ " ans = gr.Markdown(label=\"Answer\")\n",
254
+ " snips = gr.Markdown(label=\"Sources (snippets)\")\n",
255
+ " with gr.Row():\n",
256
+ " for ex in [\n",
257
+ " \"What is Nestlé’s approach to Total Rewards?\",\n",
258
+ " \"How does Nestlé handle hiring decisions?\",\n",
259
+ " \"What does the policy say about training and learning?\",\n",
260
+ " \"How are performance and promotions managed?\",\n",
261
+ " ]:\n",
262
+ " gr.Button(ex).click(lambda x=ex: x, outputs=q)\n",
263
+ " ask_btn.click(ask, inputs=q, outputs=[ans, snips])\n",
264
+ " q.submit(ask, inputs=q, outputs=[ans, snips])\n",
265
+ "\n",
266
+ "print(\"Launching... The following includes a Public URL you can open for screenshots.\")\n",
267
+ "demo.launch(share=True, server_name=\"0.0.0.0\")\n"
268
+ ]
269
+ }
270
+ ],
271
+ "metadata": {},
272
+ "nbformat": 4,
273
+ "nbformat_minor": 5
274
+ }