PranavReddy18 commited on
Commit
b41fa31
·
verified ·
1 Parent(s): 267642f

Upload 22 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ local_faiss_index/index.faiss filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.11-slim
2
+
3
+ ENV PYTHONDONTWRITEBYTECODE=1
4
+ ENV PYTHONUNBUFFERED=1
5
+ ENV PYTHONPATH=/app/src
6
+
7
+ WORKDIR /app
8
+
9
+ RUN apt-get update && apt-get install -y \
10
+ build-essential \
11
+ libglib2.0-0 \
12
+ libsm6 \
13
+ libxext6 \
14
+ libxrender-dev \
15
+ && rm -rf /var/lib/apt/lists/*
16
+
17
+ COPY . .
18
+
19
+ RUN pip install --no-cache-dir -r requirements.txt
20
+
21
+ EXPOSE 8000
22
+
23
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
data/info.txt ADDED
@@ -0,0 +1,1289 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 10th class Marks
2
+
3
+ **Board of Secondary Education
4
+ Telangana State, India**
5
+
6
+ **SECONDARY SCHOOL CERTIFICATE**
7
+ **REGULAR** PC/29/4222/04/256517/3
8
+ **TS-EE 524495**
9
+
10
+ ---
11
+
12
+ **CERTIFIED THAT**
13
+ **KATTA SAI PRANAV REDDY**
14
+ **Father's Name:** KATTA SRINIVAS REDDY
15
+ **Mother's Name:** KATTA UMARANI
16
+ **Roll No.:** 1929100642
17
+ **Date of Birth:** 03/06/2003 (Zero Three June Two Zero Zero Three)
18
+ **School:** EKALAVYA FOUNDATION SCL NALGONDA, NALGONDA DISTRICT
19
+ **Medium:** ENGLISH
20
+
21
+ Has appeared and **PASSED SSC EXAMINATION** held in **MARCH–2019**
22
+
23
+
24
+ ### **The Candidate Secured the Following Grade and Grade Points in Curricular Areas:**
25
+
26
+ | Subject | Grade FA | Grade SA | Overall Grade | Grade Point |
27
+ | ------------------------ | -------- | -------- | ------------- | ----------- |
28
+ | First Language (TELUGU) | A1 | A1 | A1 | 10 |
29
+ | Third Language (ENGLISH) | A1 | A2 | A1 | 10 |
30
+ | Mathematics | A1 | A1 | A1 | 10 |
31
+ | Science | A1 | A2 | A1 | 09 |
32
+ | Social Studies | A1 | A1 | A1 | 10 |
33
+ | Second Language (HINDI) | A1 | B1 | B1 | 08 |
34
+
35
+
36
+ ### **Cumulative Grade Point Average (CGPA): 9.5**
37
+
38
+
39
+ ### **CO-CURRICULAR AREAS:**
40
+
41
+ | Subject | Grade |
42
+ | ----------------------------- | ----- |
43
+ | Value Education & Life Skills | A+ |
44
+ | Art & Cultural Education | A+ |
45
+ | Work & Computer Education | A+ |
46
+ | Physical & Health Education | A+ |
47
+
48
+
49
+ **Head Master**
50
+ **EKALAVYA FOUNDATION SCHOOL**
51
+ **Nalgonda, Nalgonda**
52
+ **Date of Issue: 13th May, 2019**
53
+
54
+ **SECRETARY**
55
+ **Board of Secondary Education**
56
+ **Telangana State, Hyderabad**
57
+
58
+ ---
59
+
60
+ 12th class marks(Intermediate)
61
+ ---
62
+
63
+ **Telangana State Board of Intermediate Education**
64
+ Vidya Bhavan, Nampally, Hyderabad - 500 001
65
+
66
+ **PASS CERTIFICATE-CUM-MEMORANDUM OF MARKS**
67
+ **This is to certify that**
68
+ **KATTA SAI PRANAV REDDY**
69
+ **Father’s Name:** KATTA SRINIVAS REDDY
70
+ **Mother’s Name:** KATTA UMARANI
71
+ **Registered Number:** 2158208799
72
+ **Month & Year of Exam:** MARCH 2021
73
+ **Medium:** ENGLISH
74
+ **Result:** A GRADE
75
+ has appeared for the Intermediate Public Examination held in March 2021 and passed in the following subjects:
76
+
77
+
78
+ ### **Part - I**
79
+
80
+ **ENGLISH** – 98 / 100
81
+ **SANSKRIT** – 100 / 100
82
+
83
+
84
+ ### **Part - II**
85
+
86
+ **HE** – 98 / 100
87
+
88
+ ---
89
+
90
+ ### **Part - III: Optional Subjects**
91
+
92
+ | Subject | Max Marks | Marks Obtained |
93
+ | -------------------- | --------- | -------------- |
94
+ | MATHEMATICS A | 75 | 75 |
95
+ | MATHEMATICS B | 75 | 75 |
96
+ | PHYSICS | 60 | 58 |
97
+ | PHYSICS PRACTICALS | 30 | 30 |
98
+ | CHEMISTRY | 60 | 60 |
99
+ | CHEMISTRY PRACTICALS | 30 | 30 |
100
+
101
+ ---
102
+
103
+ ### **Environmental Education:** QUALIFIED
104
+
105
+ ### **Ethics and Human Values:** QUALIFIED
106
+
107
+ ---
108
+
109
+ **Total Marks:**
110
+ In Figures: **982**
111
+ In Words: **NINE EIGHT TWO**
112
+
113
+ ---
114
+
115
+ **Date:** 28-06-2021
116
+ **Signature of the Principal and College Seal:** Sri Chaitanya Jr. Kalasala
117
+ **Signature:** (Controller of Examinations)
118
+
119
+ ---
120
+
121
+ Resume
122
+ ---
123
+ **Katta Sai Pranav Reddy**
124
+ Email: [kattapranavreddy@gmail.com](mailto:kattapranavreddy@gmail.com)
125
+ GitHub: ka1817
126
+ LinkedIn: pranav-reddy-katta
127
+
128
+ ---
129
+
130
+ ### **Professional Summary**
131
+
132
+ AI and ML Engineer skilled in developing end-to-end machine learning and Generative AI solutions for real-world business challenges. Proficient in data preprocessing, exploratory data analysis, and building predictive models to deliver actionable insights. Experienced in leveraging advanced AI techniques and data-driven strategies to create scalable, impactful solutions.
133
+
134
+ ---
135
+
136
+ ### **Education**
137
+
138
+ * **Anurag University**, Hyderabad, India
139
+ B.Tech in Artificial Intelligence and Machine Learning; CGPA: 8.29
140
+ *09/2021 – 04/2025*
141
+
142
+ * **Sri Chaitanya Junior College**, Hyderabad, India
143
+ MPC (Maths, Physics, Chemistry); Percentage: 98%
144
+ *06/2019 – 05/2021*
145
+
146
+ ---
147
+
148
+ ### Experience
149
+
150
+ **iNeuron Intelligence Pvt. Ltd.** *(Remote)*
151
+ *Machine Learning Intern — 10/2024 – 11/2024*
152
+
153
+ * Conducted extensive data preprocessing and exploratory data analysis (EDA) on large customer datasets to identify key behavioral patterns and high-value customer segments.
154
+ * Developed and trained machine learning models for customer segmentation using clustering techniques such as K-Means and Hierarchical Clustering, enhancing marketing strategy alignment.
155
+ * Collaborated with cross-functional teams to interpret analytical insights and monitored model performance across different stages of the pipeline, ensuring accuracy and consistency.
156
+ * Delivered actionable recommendations based on statistical analysis and predictive modeling, supporting data-driven decision-making for targeted marketing campaigns.
157
+
158
+ **Unified Mentor Pvt. Ltd.** *(Remote)*
159
+ *Data Science Intern — 09/2024 – 10/2024*
160
+
161
+ * Developed and optimized machine learning models to predict employee attrition, enabling proactive retention strategies and improving workforce stability.
162
+ * Conducted comprehensive data preprocessing, feature engineering, and exploratory data analysis (EDA) to identify key factors influencing employee turnover.
163
+ * Delivered actionable insights and visualized patterns through dashboards and reports, supporting HR teams in making data-driven decisions.
164
+ * Presented findings to stakeholders, translating complex analytics into clear, strategic recommendations for reducing attrition risk.
165
+
166
+ ---
167
+
168
+ ### **Projects**
169
+
170
+ **BigBasket SmartCart – AI-Driven Shopping Assistant** *(06/2025 – 07/2025)* \[GitHub]
171
+
172
+ * Led development of an AI-driven shopping assistant using RAG, enabling natural language queries and semantic product search with 95% retrieval accuracy for real-time product recommendations.
173
+ * Developed a retrieval pipeline using the gte-small model, FAISS indexing, and Cross-Encoder reranking, which improved relevance score to 0.89 for intent-driven search results.
174
+ * Designed a modular architecture with FastAPI, HTML/CSS, and Docker, ensuring scalability and reduced response latency to \~2 seconds for seamless interactions.
175
+ * Implemented GitHub Actions for automated testing, Docker builds, and AWS EC2 deployment, which reduced deployment time by 40% and improved system reliability.
176
+ 🛒 BigBasket SmartCart – AI Assistant for BigBasket Shopping
177
+ 🧾 Introduction
178
+ The rapid evolution of AI technologies has created new opportunities for enhancing user experience in digital commerce. Leveraging state-of-the-art language models and retrieval systems, intelligent assistants can now understand complex queries, process vast amounts of product data, and deliver precise, context-aware responses. This project presents a scalable and robust AI-powered shopping assistant tailored for BigBasket's product ecosystem. Built using Retrieval-Augmented Generation (RAG), vector embeddings, and large language models (LLMs), the system enables efficient and intelligent product discovery through natural language interaction.
179
+
180
+ ❗ Problem Statement
181
+ Online shoppers frequently seek personalized and context-specific product recommendations, such as identifying the best-rated skincare item at the lowest price. However, conventional search systems often fall short in understanding such nuanced queries, lacking the ability to interpret intent, compare attributes across products, and deliver concise, relevant results. This creates friction in the user journey, leading to suboptimal shopping experiences. There is a clear need for an intelligent assistant that can process natural language queries, reason over structured product data, and deliver accurate, insightful responses to aid decision-making.
182
+
183
+ Business Goal:
184
+ To enhance the shopping experience, boost conversion rates, and optimize search efficiency by enabling natural language-based product search that understands user intent and delivers context-aware, personalized recommendations.
185
+
186
+ 💰 Business Impact (Revenue + Cost)
187
+ 💸 1. Increased Conversion Rates (↑ Revenue)
188
+
189
+ • Users find relevant products faster, leading to more product views, cart adds, and purchases
190
+
191
+ • Personalized recommendations match buyer intent better than traditional search
192
+
193
+ • Better UX = lower drop-off rates
194
+ 📈 Even a 1–2% uplift in conversions from improved product search can lead to significant revenue gains for a large marketplace like BigBasket.
195
+
196
+ 📉 2. Reduced Customer Support Queries (↓ Cost)
197
+
198
+ • AI assistant can handle informational and product-related queries
199
+
200
+ • Reduces manual intervention, live chat support, and email volume
201
+
202
+ • More self-service = less operational overhead
203
+ ⏱️ 3. Reduced Time-to-Purchase (↑ Efficiency)
204
+
205
+ • Customers make faster decisions because the assistant summarizes comparisons (e.g., price vs. rating trade-offs)
206
+
207
+ • This shortens the purchase journey and increases user satisfaction
208
+ 🧪 4. Rapid Experimentation & Deployment (↓ Dev Costs)
209
+
210
+ • The project is modular, Dockerized, and CI/CD enabled → easier to iterate and deploy
211
+
212
+ • Can be extended to other verticals (electronics, fashion) or other marketplaces with minimal changes
213
+ 🚀 Features
214
+ 🔍 Natural Language Product Search Users can ask queries like "cheapest skin care with highest rating" or "best perfume under ₹500".
215
+
216
+ 🧠 Query Rewriting with LLM Uses Groq LLMs (gemma2-9b-it) to refine user queries for more precise retrieval.
217
+
218
+ 📄 Document Embedding & Vector Search Preprocessed BigBasket product data embedded with thenlper/gte-small and indexed using FAISS.
219
+
220
+ 🤖 RAG Pipeline Uses llama3-70b-8192 model for final answer generation based on retrieved and reranked results.
221
+
222
+ 🔁 Reranking with CrossEncoder Improves accuracy using cross-encoder/ms-marco-MiniLM-L-6-v2.
223
+
224
+ 🌐 FastAPI Backend Easily accessible via localhost:8000 or deployed server.
225
+
226
+ 🐳 Dockerized Build once, run anywhere. Fully containerized using Docker.
227
+
228
+ 🚰 CI/CD with GitHub Actions Automated testing, image build, and push to DockerHub.
229
+
230
+ 📜 Logging Logging implemented for each step in the pipeline for transparency and debugging.
231
+
232
+ 🗂️ Folder Structure
233
+ BIGBASKET/
234
+ ├── .github/
235
+ │ └── workflows/
236
+ │ └── ci-cd.yml
237
+ ├── data/
238
+ │ └── BigBasket Products.csv
239
+ ├── logs/
240
+ │ ├── data_ingestion.log
241
+ │ ├── data_preprocessing.log
242
+ │ ├── query_rewriting.log
243
+ │ └── retrieval_generation.log
244
+ ├── src/
245
+ │ ├── utils/
246
+ │ │ └── logger.py
247
+ │ ├── __init__.py
248
+ │ ├── data_ingestion.py
249
+ │ ├── data_preprocessing.py
250
+ │ ├── query_rewritting.py
251
+ │ └── retrival_genaration.py
252
+ ├── static/
253
+ │ └── css/
254
+ │ └── style.css
255
+ ├── templates/
256
+ │ └── index.html
257
+ ├── tests/
258
+ ├── ui/
259
+ ├── main.py
260
+ ├── Dockerfile
261
+ ├── requirements.txt
262
+ ├── .env
263
+ ├── .dockerignore
264
+ ├── .gitignore
265
+ └── README.md
266
+ 🧪 Local Development Setup
267
+ # Clone the repository
268
+ git clone https://github.com/ka1817/BigBasket-SmartCart-AI-Assistant-for-BigBasket-Shopping
269
+ cd BigBasket-SmartCart-AI-Assistant-for-BigBasket-Shopping
270
+
271
+ # Create virtual environment
272
+ python -m venv venv
273
+ source venv/bin/activate # On Windows: venv\Scripts\activate
274
+
275
+ # Install dependencies
276
+ pip install -r requirements.txt
277
+
278
+ # Before Running the app set .env(environment variable GROQ_API_KEY)
279
+ uvicorn main:app --reload --port 8000
280
+ 🐳 Docker Instructions
281
+ 🔧 1. Pull Image
282
+
283
+ docker pull pranavreddy123/bigbasket-assistant:latest
284
+ 🚀 2. Run the App (Detached Mode)
285
+
286
+ docker run -d -p 8000:8000 \
287
+ -e GROQ_API_KEY=<your_groq_api_key> \
288
+ pranavreddy123/bigbasket-assistant:latest
289
+ 🌐 3. Access the App
290
+
291
+ http://localhost:8000
292
+ 🤖 Example Usage
293
+
294
+ Query: "Which is the cheapest hair product with high rating?" Rewritten: "Find the most affordable hair care product with a high customer rating." Response: "Garlic Oil - Vegetarian Capsule 500 mg by Sri Sri Ayurveda is available at ₹220 with a 4.1 rating."
295
+
296
+ 🛠️ GitHub Actions (CI/CD)
297
+ File: .github/workflows/ci-cd.yml
298
+
299
+ ✅ CI-Test: Runs unit tests using pytest.
300
+
301
+ 🐳 CD-Docker: Builds Docker image and pushes to DockerHub.
302
+
303
+ Triggered on push to main or pull request.
304
+
305
+ ☁️ Deployment on Amazon EC2
306
+ 1. Launch EC2 Instance (Ubuntu 20.04)
307
+ 2. SSH into your instance
308
+ ssh -i "your-key.pem" ubuntu@your-ec2-ip
309
+ 3. Install Docker
310
+ sudo apt update
311
+ sudo apt install docker.io -y
312
+ sudo systemctl start docker
313
+ sudo systemctl enable docker
314
+ 4. Pull and Run Docker Image
315
+ docker pull pranavreddy123/bigbasket-assistant:latest
316
+ # Ensure your .env file is in the same directory, or create an API key using Groq Cloud and add it to the .env file
317
+ docker run -d --env-file .env -p 8000:8000 pranavreddy123/bigbasket-assistant:latest
318
+ Access your app via http://<your-ec2-public-ip>:8000
319
+ 🧠 Tech Stack
320
+ ✅ LLMs: Groq (gemma2-9b-it, llama3-70b-8192)
321
+
322
+ ✅ LangChain, FAISS, HuggingFace, CrossEncoder
323
+
324
+ ✅ FastAPI
325
+
326
+ ✅ Docker
327
+
328
+ ✅ GitHub Actions
329
+
330
+ ✅ AWS EC2
331
+
332
+ ✅ HTML/CSS
333
+
334
+ 🔗 Links
335
+ 🔍 GitHub Repo: BigBasket-SmartCart-AI-Assistant-for-BigBasket-Shopping
336
+
337
+ 🐳 DockerHub: pranavreddy123/bigbasket-assistant
338
+
339
+ 🧑‍💻 Developed By
340
+ Pranav Reddy
341
+
342
+ **Netflix Customer Churn Prediction – End-to-End ML System** *(Personal Project)* \[GitHub]
343
+
344
+ * Developed a complete machine learning pipeline to predict customer churn, achieving 99% recall and 0.99 ROC AUC through feature engineering, hyperparameter tuning, and cross-validation.
345
+ * Performed in-depth EDA to identify key churn drivers such as low engagement, infrequent logins, and payment methods, improving model interpretability and business insights.
346
+ * Implemented reproducible MLOps workflows with data versioning using DVC and AWS S3, and tracked experiments, metrics, and model artifacts using MLflow.
347
+ * Designed and deployed a FastAPI-based REST API with HTML/CSS frontend for real-time predictions, containerized the application using Docker, and automated CI/CD using GitHub Actions for deployment on AWS EC2.
348
+
349
+ 📊 Netflix Customer Churn Prediction
350
+ 🚀 Project Overview
351
+ Netflix, like many subscription-based platforms, faces the challenge of customer churn. Retaining existing customers is significantly more cost-effective than acquiring new ones. This project delivers a full-scale machine learning solution to predict customer churn using behavioral and subscription data, from ingestion to deployment via a FastAPI interface.
352
+
353
+ This repository presents a production-grade, explainable, and reproducible ML pipeline with CI/CD, experiment tracking (MLflow), data versioning (DVC), and containerized deployment using Docker.
354
+
355
+ 🎯 Problem Statement
356
+ Netflix seeks to proactively identify users likely to cancel their subscriptions. Predicting churn enables targeted interventions to retain users and minimize revenue loss.
357
+
358
+ Goal: Build an ML classification model that predicts churn based on customer behavior and plan details.
359
+
360
+ 📌 Key Features Used
361
+ Feature Type Description
362
+ watch_hours Numerical Total hours watched
363
+ last_login_days Numerical Days since last login
364
+ number_of_profiles Numerical Total profiles under the account
365
+ avg_watch_time_per_day Numerical Daily average watch time
366
+ subscription_type Categorical Subscription level: Basic, Standard, Premium
367
+ payment_method Categorical Payment method: Credit Card, UPI, PayPal, etc.
368
+ churned Target 1 = Churned, 0 = Not churned
369
+ 📊 Key EDA Insights
370
+ 🔬 Feature Significance
371
+ Feature Test p-value Significant?
372
+ subscription_type Chi-Square 0.0000 ✅ Yes
373
+ payment_method Chi-Square 0.0000 ✅ Yes
374
+ number_of_profiles Chi-Square 0.0000 ✅ Yes
375
+ watch_hours Mann-Whitney U 0.0000 ✅ Yes
376
+ last_login_days Mann-Whitney U 0.0000 ✅ Yes
377
+ avg_watch_time_per_day Mann-Whitney U 0.0000 ✅ Yes
378
+ age Mann-Whitney U 0.7803 ❌ No
379
+ gender, region, device Chi-Square > 0.3 ❌ No
380
+ ✅ These statistically significant features were included in the final model pipeline.
381
+
382
+ 🏗️ Project Architecture
383
+ netflix-churn-prediction/
384
+ ├── data/ # Raw and processed data
385
+ ├── models/ # Trained model binaries
386
+ ├── reports/ # Classification reports & plots
387
+ ├── static/ # CSS
388
+ ├── templates/ # HTML UI
389
+ ├── src/
390
+ │ ├── data_ingestion.py # Load dataset
391
+ │ ├── data_preprocessing.py # Pipeline for scaling & encoding
392
+ │ └── model_training.py # ML training & evaluation
393
+ ├── main.py # FastAPI backend
394
+ ├── Dockerfile # Containerization
395
+ ├── .dvc/ # DVC for data version control
396
+ ├── .github/workflows/ # CI/CD GitHub Actions
397
+ └── README.md
398
+ ⚙️ End-to-End ML Workflow
399
+ 1️⃣ Data Ingestion
400
+ Loads .csv into DataFrame
401
+ Handles errors and logs shape/summary
402
+ 2️⃣ Preprocessing
403
+ OneHotEncoding (categorical)
404
+ StandardScaler (numerical)
405
+ Uses ColumnTransformer for pipeline modularity
406
+ 3️⃣ Model Training
407
+ Models: RandomForest, GradientBoosting, SVC
408
+ GridSearchCV for hyperparameter tuning
409
+ Model artifacts saved to models/
410
+ ROC curves + classification reports saved to reports/
411
+ 4️⃣ MLflow Tracking ✅
412
+ Tracks experiment metadata, metrics, parameters
413
+ Stores models and artifacts
414
+ UI accessible at localhost:5000
415
+ 🧪 Model Performance
416
+ Model Accuracy F1 Score ROC AUC (Test) ROC AUC (CV) Notes
417
+ Random Forest 0.99 0.99 0.9995 0.9987 ✅ Best overall
418
+ Gradient Boosting 0.99 0.99 0.9989 0.9991 Robust & efficient
419
+ SVC 0.93 0.93 0.9844 0.9822 Lightweight
420
+ 🌐 FastAPI Deployment
421
+ 🔧 API Endpoints:
422
+ /: HTML frontend form for manual input
423
+ /api/predict: JSON-based API for programmatic inference
424
+ 🔌 Model Used:
425
+ Random Forest (best AUC + accuracy)
426
+ Accepts form or JSON input
427
+ Returns churn prediction + confidence
428
+ 🐳 Docker Setup
429
+ FROM python:3.10-slim
430
+ WORKDIR /app
431
+ COPY requirements.txt .
432
+ RUN pip install -r requirements.txt
433
+ COPY . .
434
+ EXPOSE 8000
435
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
436
+ Run locally:
437
+
438
+ docker build -t netflix-churn .
439
+ docker run -p 8000:8000 netflix-churn
440
+ 🔁 CI/CD Pipeline (GitHub Actions)
441
+ ✅ Stages:
442
+ Test Phase
443
+
444
+ Install dependencies
445
+ Run pytest on unit tests
446
+ Pull versioned data using dvc pull
447
+ Build Phase
448
+
449
+ Docker image build with CACHEBUST arg
450
+ Push to DockerHub using GitHub Secrets
451
+ Deploy Phase
452
+
453
+ SSH into EC2 instance
454
+ Stop, remove old container
455
+ Pull and launch updated Docker image
456
+ 🔐 GitHub Repository Secrets
457
+ Name Purpose
458
+ AWS_ACCESS_KEY_ID AWS auth for DVC S3
459
+ AWS_SECRET_ACCESS_KEY AWS auth for DVC S3
460
+ DOCKER_USERNAME DockerHub username for push
461
+ DOCKER_PASSWORD DockerHub password/token
462
+ EC2_HOST Public IP/DNS of EC2 instance
463
+ EC2_USER SSH user for EC2 login
464
+ EC2_SSH_KEY Private SSH key for GitHub Actions
465
+ 🧬 Data Versioning with DVC
466
+ Tracks raw and preprocessed data versions
467
+ Uses .dvc/config to connect to AWS S3 remote
468
+ Run dvc push and dvc pull to sync across environments
469
+ Ensures reproducibility in CI and local experiments
470
+ 📌 Business Value & Insights
471
+ 🧠 High-risk churn users are linked to:
472
+
473
+ Low engagement (low watch hours)
474
+ Infrequent logins
475
+ Basic plans & non-card payments
476
+ 📈 Operational Benefits:
477
+
478
+ Preemptive retention campaigns
479
+ Personalized offers to vulnerable users
480
+ Reduce marketing costs via targeted outreach
481
+ ✅ Run Locally (No Docker)
482
+ git clone <repo_url>
483
+ cd netflix-churn-prediction
484
+ python src/model_training.py # Train all models
485
+ uvicorn main:app --reload # Launch API server
486
+ Summary
487
+ Component Implemented Tool/Service Used
488
+ Data Versioning ✅ DVC with AWS S3 remote
489
+ Data Ingestion ✅ pandas, custom Python class
490
+ Data Preprocessing ✅ scikit-learn Pipelines
491
+ Model Training ✅ scikit-learn, GridSearchCV
492
+ Experiment Tracking ✅ MLflow (local server: 127.0.0.1:5000)
493
+ Model Evaluation ✅ classification_report, ROC AUC
494
+ Model Packaging ✅ joblib for serialization
495
+ API Deployment ✅ FastAPI on AWS EC2
496
+ Web UI ✅ HTML + Bootstrap via Jinja2
497
+ Containerization ✅ Docker (with Dockerfile)
498
+ CI/CD Pipeline ✅ GitHub Actions
499
+ Cloud Hosting ✅ AWS EC2, SSH-based deployment
500
+ Secrets Management ✅ GitHub Secrets
501
+ Testing ✅ pytest, CI-tested
502
+ 🙌 Author
503
+ 👨‍💻 Katta Sai Pranav Reddy
504
+ 📎 Tech Stack
505
+ Python 3.10
506
+ Scikit-learn, MLflow, DVC, FastAPI, Docker
507
+ GitHub Actions, AWS EC2, S3 Remote Storage
508
+ ---
509
+
510
+ ### **Skills**
511
+
512
+ * **Tools:** MLflow, DVC, Docker, Git, GitHub Actions, AWS (EC2, S3, ECR), FAISS, Pinecone, Hugging Face, LangChain, LangSmith, FastAPI
513
+ * **Programming & Technical Skills:** Python, SQL, HTML, CSS, Scikit-learn, TensorFlow, Keras, Statistics
514
+ * **Data Science & Machine Learning:** Data Preprocessing, EDA, Feature Engineering, Model Training & Evaluation, Hyperparameter Tuning, Clustering, MLOps, Semantic Search, Retrieval-Augmented Generation (RAG), CNN, RNN, GPT, Transformers, Fine-Tuning, Prompt Engineering
515
+ * **Data Visualization & Analysis:** Pandas, NumPy, Matplotlib, Seaborn
516
+
517
+ ---
518
+
519
+ hobbies section
520
+
521
+ ---
522
+
523
+ ### **Hobbies & Interests**
524
+ Hobbies & Interests
525
+
526
+ * Playing Cricket
527
+ * Watching Football
528
+ * Reading Books
529
+ * Exploring Latest Advancements in Artificial Intelligence
530
+ * Browsing the Internet for Tech & Knowledge Updates
531
+
532
+ ---
533
+
534
+ ### Contact Information
535
+
536
+ Contact Information
537
+
538
+ 📞 **Phone:** +91 93475 41040
539
+ 📧 **Email:** [kattapranavreddy@gmail.com](mailto:kattapranavreddy@gmail.com)
540
+ 💻 **GitHub:** [github.com/ka1817](https://github.com/ka1817)
541
+ 🔗 **LinkedIn:** [linkedin.com/in/pranav-reddy-katta](https://www.linkedin.com/in/pranav-reddy-katta/)
542
+ ---
543
+
544
+ 10th
545
+ ---
546
+
547
+ **Board of Secondary Education
548
+ Telangana State, India**
549
+
550
+ **SECONDARY SCHOOL CERTIFICATE**
551
+ **REGULAR** PC/29/4222/04/256517/3
552
+ **TS-EE 524495**
553
+
554
+ ---
555
+
556
+ **CERTIFIED THAT**
557
+ **KATTA SAI PRANAV REDDY**
558
+ **Father's Name:** KATTA SRINIVAS REDDY
559
+ **Mother's Name:** KATTA UMARANI
560
+ **Roll No.:** 1929100642
561
+ **Date of Birth:** 03/06/2003 (Zero Three June Two Zero Zero Three)
562
+ **School:** EKALAVYA FOUNDATION SCL NALGONDA, NALGONDA DISTRICT
563
+ **Medium:** ENGLISH
564
+
565
+ Has appeared and **PASSED SSC EXAMINATION** held in **MARCH–2019**
566
+
567
+ ---
568
+
569
+ ### **The Candidate Secured the Following Grade and Grade Points in Curricular Areas:**
570
+
571
+ | Subject | Grade FA | Grade SA | Overall Grade | Grade Point |
572
+ | ------------------------ | -------- | -------- | ------------- | ----------- |
573
+ | First Language (TELUGU) | A1 | A1 | A1 | 10 |
574
+ | Third Language (ENGLISH) | A1 | A2 | A1 | 10 |
575
+ | Mathematics | A1 | A1 | A1 | 10 |
576
+ | Science | A1 | A2 | A1 | 09 |
577
+ | Social Studies | A1 | A1 | A1 | 10 |
578
+ | Second Language (HINDI) | A1 | B1 | B1 | 08 |
579
+
580
+ ---
581
+
582
+ ### **Cumulative Grade Point Average (CGPA): 9.5**
583
+
584
+ ---
585
+
586
+ ### **CO-CURRICULAR AREAS:**
587
+
588
+ | Subject | Grade |
589
+ | ----------------------------- | ----- |
590
+ | Value Education & Life Skills | A+ |
591
+ | Art & Cultural Education | A+ |
592
+ | Work & Computer Education | A+ |
593
+ | Physical & Health Education | A+ |
594
+
595
+ ---
596
+
597
+ ### **Marks of Identification:**
598
+
599
+ 1. A MOLE ON THE LEFT HAND RING FINGER
600
+ 2. A MOLE ON THE RIGHT ELBOW
601
+
602
+ ---
603
+
604
+ **Head Master**
605
+ **EKALAVYA FOUNDATION SCHOOL**
606
+ **Nalgonda, Nalgonda**
607
+ **Date of Issue: 13th May, 2019**
608
+
609
+ ---
610
+
611
+ **SECRETARY**
612
+ **Board of Secondary Education**
613
+ **Telangana State, Hyderabad**
614
+
615
+ **Aadhaar No.:** 774291627518
616
+
617
+ ---
618
+
619
+ 12th(Intermediate)
620
+ ---
621
+
622
+ **Telangana State Board of Intermediate Education**
623
+ Vidya Bhavan, Nampally, Hyderabad - 500 001
624
+
625
+ **PASS CERTIFICATE-CUM-MEMORANDUM OF MARKS**
626
+ **This is to certify that**
627
+ **KATTA SAI PRANAV REDDY**
628
+ **Father’s Name:** KATTA SRINIVAS REDDY
629
+ **Mother’s Name:** KATTA UMARANI
630
+ **Registered Number:** 2158208799
631
+ **Month & Year of Exam:** MARCH 2021
632
+ **Medium:** ENGLISH
633
+ **Result:** A GRADE
634
+ has appeared for the Intermediate Public Examination held in March 2021 and passed in the following subjects:
635
+
636
+ ---
637
+
638
+ ### **Part - I**
639
+
640
+ **ENGLISH** – 98 / 100
641
+ **SANSKRIT** – 100 / 100
642
+
643
+ ---
644
+
645
+ ### **Part - II**
646
+
647
+ **HE** – 98 / 100
648
+
649
+ ---
650
+
651
+ ### **Part - III: Optional Subjects**
652
+
653
+ | Subject | Max Marks | Marks Obtained |
654
+ | -------------------- | --------- | -------------- |
655
+ | MATHEMATICS A | 75 | 75 |
656
+ | MATHEMATICS B | 75 | 75 |
657
+ | PHYSICS | 60 | 58 |
658
+ | PHYSICS PRACTICALS | 30 | 30 |
659
+ | CHEMISTRY | 60 | 60 |
660
+ | CHEMISTRY PRACTICALS | 30 | 30 |
661
+
662
+ ---
663
+
664
+ ### **Environmental Education:** QUALIFIED
665
+
666
+ ### **Ethics and Human Values:** QUALIFIED
667
+
668
+ ---
669
+
670
+ **Total Marks:**
671
+ In Figures: **982**
672
+ In Words: **NINE EIGHT TWO**
673
+
674
+ ---
675
+
676
+ **Date:** 28-06-2021
677
+ **Signature of the Principal and College Seal:** Sri Chaitanya Jr. Kalasala
678
+ **Signature:** (Controller of Examinations)
679
+
680
+ ---
681
+
682
+ Resume
683
+ ---
684
+ **Katta Sai Pranav Reddy**
685
+ Email: [kattapranavreddy@gmail.com](mailto:kattapranavreddy@gmail.com)
686
+ GitHub: ka1817
687
+ LinkedIn: pranav-reddy-katta
688
+
689
+ ---
690
+
691
+ ### **Professional Summary**
692
+
693
+ AI and ML Engineer skilled in developing end-to-end machine learning and Generative AI solutions for real-world business challenges. Proficient in data preprocessing, exploratory data analysis, and building predictive models to deliver actionable insights. Experienced in leveraging advanced AI techniques and data-driven strategies to create scalable, impactful solutions.
694
+
695
+ ---
696
+
697
+ # Education
698
+
699
+ * **Anurag University**, Hyderabad, India
700
+ B.Tech in Artificial Intelligence and Machine Learning; CGPA: 8.29
701
+ *09/2021 – 04/2025*
702
+
703
+ * **Sri Chaitanya Junior College**, Hyderabad, India
704
+ MPC (Maths, Physics, Chemistry); Percentage: 98%
705
+ *06/2019 – 05/2021*
706
+
707
+ -----------------------------------------
708
+
709
+ ### Pranav Work Experience
710
+
711
+ Work Experience
712
+
713
+ iNeuron Intelligence Pvt. Ltd.(Remote)
714
+ Machine Learning Intern — 10/2024 – 11/2024
715
+
716
+ 1.Conducted extensive data preprocessing and exploratory data analysis (EDA) on large customer datasets to identify key behavioral patterns and high-value customer segments.
717
+ 2.Developed and trained machine learning models for customer segmentation using clustering techniques such as K-Means and Hierarchical Clustering, enhancing marketing strategy alignment.
718
+ 3.Collaborated with cross-functional teams to interpret analytical insights and monitored model performance across different stages of the pipeline, ensuring accuracy and consistency.
719
+ 4.Delivered actionable recommendations based on statistical analysis and predictive modeling, supporting data-driven decision-making for targeted marketing campaigns.
720
+
721
+ Unified Mentor Pvt. Ltd.(Remote)
722
+ Data Science Intern — 09/2024 – 10/2024
723
+ 1. Developed and optimized machine learning models to predict employee attrition, enabling proactive retention strategies and improving workforce stability.
724
+ 2. Conducted comprehensive data preprocessing, feature engineering, and exploratory data analysis (EDA) to identify key factors influencing employee turnover.
725
+ 3. Delivered actionable insights and visualized patterns through dashboards and reports, supporting HR teams in making data-driven decisions.
726
+ 4. Presented findings to stakeholders, translating complex analytics into clear, strategic recommendations for reducing attrition risk.
727
+
728
+
729
+ -------------------------
730
+
731
+ # Projects
732
+
733
+ Pranav Reddy's Projects
734
+
735
+ 1. 🛒 BigBasket SmartCart – AI Assistant for BigBasket Shopping
736
+ ---
737
+ ## 🧾 Introduction
738
+
739
+ The rapid evolution of AI technologies has created new opportunities for enhancing user experience in digital commerce. Leveraging state-of-the-art language models and retrieval systems, intelligent assistants can now understand complex queries, process vast amounts of product data, and deliver precise, context-aware responses. This project presents a scalable and robust AI-powered shopping assistant tailored for BigBasket's product ecosystem. Built using Retrieval-Augmented Generation (RAG), vector embeddings, and large language models (LLMs), the system enables efficient and intelligent product discovery through natural language interaction.
740
+
741
+ ---
742
+
743
+ ## ❗ Problem Statement
744
+
745
+ Online shoppers frequently seek personalized and context-specific product recommendations, such as identifying the best-rated skincare item at the lowest price. However, conventional search systems often fall short in understanding such nuanced queries, lacking the ability to interpret intent, compare attributes across products, and deliver concise, relevant results. This creates friction in the user journey, leading to suboptimal shopping experiences. There is a clear need for an intelligent assistant that can process natural language queries, reason over structured product data, and deliver accurate, insightful responses to aid decision-making.
746
+
747
+ ---
748
+
749
+ ## Business Goal:
750
+
751
+ To enhance the shopping experience, boost conversion rates, and optimize search efficiency by enabling natural language-based product search that understands user intent and delivers context-aware, personalized recommendations.
752
+
753
+
754
+
755
+ ## 💰 Business Impact (Revenue + Cost)
756
+
757
+ 💸 1. Increased Conversion Rates (↑ Revenue)
758
+
759
+ • Users find relevant products faster, leading to more product views, cart adds, and purchases
760
+
761
+ • Personalized recommendations match buyer intent better than traditional search
762
+
763
+ • Better UX = lower drop-off rates
764
+
765
+ 📈 Even a 1–2% uplift in conversions from improved product search can lead to significant revenue gains for a large marketplace like BigBasket.
766
+
767
+ 📉 2. Reduced Customer Support Queries (↓ Cost)
768
+
769
+ • AI assistant can handle informational and product-related queries
770
+
771
+ • Reduces manual intervention, live chat support, and email volume
772
+
773
+ • More self-service = less operational overhead
774
+
775
+ ⏱️ 3. Reduced Time-to-Purchase (↑ Efficiency)
776
+
777
+ • Customers make faster decisions because the assistant summarizes comparisons (e.g., price vs. rating trade-offs)
778
+
779
+ • This shortens the purchase journey and increases user satisfaction
780
+
781
+ 🧪 4. Rapid Experimentation & Deployment (↓ Dev Costs)
782
+
783
+ • The project is modular, Dockerized, and CI/CD enabled → easier to iterate and deploy
784
+
785
+ • Can be extended to other verticals (electronics, fashion) or other marketplaces with minimal changes
786
+
787
+ ---
788
+
789
+ ## 🚀 Features
790
+
791
+ 🔍 Natural Language Product Search
792
+ Users can ask queries like "cheapest skin care with highest rating" or "best perfume under ₹500".
793
+
794
+ 🧠 Query Rewriting with LLM
795
+ Uses Groq LLMs (gemma2-9b-it) to refine user queries for more precise retrieval.
796
+
797
+ 📄 Document Embedding & Vector Search
798
+ Preprocessed BigBasket product data embedded with thenlper/gte-small and indexed using FAISS.
799
+
800
+ 🤖 RAG Pipeline
801
+ Uses llama3-70b-8192 model for final answer generation based on retrieved and reranked results.
802
+
803
+ 🔁 Reranking with CrossEncoder
804
+ Improves accuracy using cross-encoder/ms-marco-MiniLM-L-6-v2.
805
+
806
+ 🌐 FastAPI Backend
807
+ Easily accessible via localhost:8000 or deployed server.
808
+
809
+ 🐳 Dockerized
810
+ Build once, run anywhere. Fully containerized using Docker.
811
+
812
+ 🚰 CI/CD with GitHub Actions
813
+ Automated testing, image build, and push to DockerHub.
814
+
815
+ 📜 Logging
816
+ Logging implemented for each step in the pipeline for transparency and debugging.
817
+
818
+ ---
819
+
820
+ ## 🗂️ Folder Structure
821
+
822
+ ```bash
823
+ BIGBASKET/
824
+ ├── .github/
825
+ │ └── workflows/
826
+ │ └── ci-cd.yml
827
+ ├── data/
828
+ │ └── BigBasket Products.csv
829
+ ├── logs/
830
+ │ ├── data_ingestion.log
831
+ │ ├── data_preprocessing.log
832
+ │ ├── query_rewriting.log
833
+ │ └── retrieval_generation.log
834
+ ├── src/
835
+ │ ├── utils/
836
+ │ │ └── logger.py
837
+ │ ├── __init__.py
838
+ │ ├── data_ingestion.py
839
+ │ ├── data_preprocessing.py
840
+ │ ├── query_rewritting.py
841
+ │ └── retrival_genaration.py
842
+ ├── static/
843
+ │ └── css/
844
+ │ └── style.css
845
+ ├── templates/
846
+ │ └── index.html
847
+ ├── tests/
848
+ ├── ui/
849
+ ├── main.py
850
+ ├── Dockerfile
851
+ ├── requirements.txt
852
+ ├── .env
853
+ ├── .dockerignore
854
+ ├── .gitignore
855
+ └── README.md
856
+ ```
857
+
858
+ ---
859
+
860
+ # 🧪 Local Development Setup
861
+
862
+ ```bash
863
+ # Clone the repository
864
+ git clone https://github.com/ka1817/BigBasket-SmartCart-AI-Assistant-for-BigBasket-Shopping
865
+ cd BigBasket
866
+
867
+ # Create virtual environment
868
+ python -m venv venv
869
+ source venv/bin/activate # On Windows: venv\Scripts\activate
870
+
871
+ # Install dependencies
872
+ pip install -r requirements.txt
873
+
874
+ # Before Running the app set .env(environment variable GROQ_API_KEY)
875
+ uvicorn main:app --reload --port 8000
876
+ ```
877
+
878
+ ## 🐳 Docker Instructions
879
+
880
+ 🔧 1. Pull Image
881
+
882
+ ```bash
883
+ docker pull pranavreddy123/bigbasket-assistant:latest
884
+ ```
885
+
886
+ 🚀 2. Run the App (Detached Mode)
887
+
888
+ ```bash
889
+ docker run -d -p 8000:8000 \
890
+ -e GROQ_API_KEY=<your_groq_api_key> \
891
+ pranavreddy123/bigbasket-assistant:latest
892
+ ```
893
+
894
+ 🌐 3. Access the App
895
+
896
+ ```bash
897
+ http://localhost:8000
898
+ ```
899
+
900
+ ---
901
+
902
+ 🤖 Example Usage
903
+
904
+ Query: "Which is the cheapest hair product with high rating?"
905
+ Rewritten: "Find the most affordable hair care product with a high customer rating."
906
+ Response: "Garlic Oil - Vegetarian Capsule 500 mg by Sri Sri Ayurveda is available at ₹220 with a 4.1 rating."
907
+
908
+ ---
909
+
910
+ ## 🛠️ GitHub Actions (CI/CD)
911
+
912
+ File: .github/workflows/ci-cd.yml
913
+
914
+ ✅ CI-Test: Runs unit tests using pytest.
915
+
916
+ 🐳 CD-Docker: Builds Docker image and pushes to DockerHub.
917
+
918
+ Triggered on push to main or pull request.
919
+
920
+ ---
921
+
922
+ ## ☁️ Deployment on Amazon EC2
923
+
924
+ ### 1. Launch EC2 Instance (Ubuntu 20.04)
925
+
926
+ ### 2. SSH into your instance
927
+
928
+ ```bash
929
+ ssh -i "your-key.pem" ubuntu@your-ec2-ip
930
+ ```
931
+
932
+ ### 3. Install Docker
933
+
934
+ ```bash
935
+ sudo apt update
936
+ sudo apt install docker.io -y
937
+ sudo systemctl start docker
938
+ sudo systemctl enable docker
939
+ ```
940
+
941
+ ### 4. Pull and Run Docker Image
942
+
943
+ ```bash
944
+ docker pull pranavreddy123/bigbasket-assistant:latest
945
+ # Ensure your .env file is in the same directory, or create an API key using Groq Cloud and add it to the .env file
946
+ docker run -d --env-file .env -p 8000:8000 pranavreddy123/bigbasket-assistant:latest
947
+ ```
948
+
949
+ ## Access your app via `http://<your-ec2-public-ip>:8000`
950
+
951
+ ## 🧠 Tech Stack
952
+
953
+ ✅ LLMs: Groq (gemma2-9b-it, llama3-70b-8192)
954
+
955
+ ✅ LangChain, FAISS, HuggingFace, CrossEncoder
956
+
957
+ ✅ FastAPI
958
+
959
+ ✅ Docker
960
+
961
+ ✅ GitHub Actions
962
+
963
+ ✅ AWS EC2
964
+
965
+ ✅ HTML/CSS
966
+
967
+ ---
968
+
969
+ ## 🔗 Links
970
+
971
+ 🔍 GitHub Repo: BigBasket-SmartCart-AI-Assistant-for-BigBasket-Shopping
972
+
973
+ 🐳 DockerHub: pranavreddy123/bigbasket-assistant
974
+
975
+ ---
976
+
977
+ ## 🧑‍💻 Developed By
978
+
979
+ Pranav Reddy
980
+
981
+
982
+ **2. Netflix Customer Churn Prediction – End-to-End ML System** *(Personal Project)* \[GitHub]
983
+
984
+ * Developed a complete machine learning pipeline to predict customer churn, achieving 99% recall and 0.99 ROC AUC through feature engineering, hyperparameter tuning, and cross-validation.
985
+ * Performed in-depth EDA to identify key churn drivers such as low engagement, infrequent logins, and payment methods, improving model interpretability and business insights.
986
+ * Implemented reproducible MLOps workflows with data versioning using DVC and AWS S3, and tracked experiments, metrics, and model artifacts using MLflow.
987
+ * Designed and deployed a FastAPI-based REST API with HTML/CSS frontend for real-time predictions, containerized the application using Docker, and automated CI/CD using GitHub Actions for deployment on AWS EC2.
988
+
989
+ 📊 Netflix Customer Churn Prediction
990
+ # 📊 Netflix Customer Churn Prediction
991
+
992
+ # Project Overview
993
+
994
+ Netflix, like many subscription-based platforms, faces the challenge of customer churn. Retaining existing customers is significantly more cost-effective than acquiring new ones. This project delivers a full-scale machine learning solution to predict customer churn using behavioral and subscription data, from ingestion to deployment via a FastAPI interface.
995
+
996
+ This repository presents a production-grade, explainable, and reproducible ML pipeline with CI/CD, experiment tracking (**MLflow**), data versioning (**DVC**), and containerized deployment using **Docker**.
997
+
998
+ ---
999
+
1000
+ # Problem Statement
1001
+
1002
+ Netflix seeks to proactively identify users likely to cancel their subscriptions. Predicting churn enables targeted interventions to retain users and minimize revenue loss.
1003
+
1004
+ > **Goal:** Build an ML classification model that predicts churn based on customer behavior and plan details.
1005
+
1006
+ ---
1007
+
1008
+ ## Key Features Used
1009
+
1010
+ | Feature | Type | Description |
1011
+ | -------------------------- | ----------- | ---------------------------------------------- |
1012
+ | watch\_hours | Numerical | Total hours watched |
1013
+ | last\_login\_days | Numerical | Days since last login |
1014
+ | number\_of\_profiles | Numerical | Total profiles under the account |
1015
+ | avg\_watch\_time\_per\_day | Numerical | Daily average watch time |
1016
+ | subscription\_type | Categorical | Subscription level: Basic, Standard, Premium |
1017
+ | payment\_method | Categorical | Payment method: Credit Card, UPI, PayPal, etc. |
1018
+ | churned | Target | 1 = Churned, 0 = Not churned |
1019
+
1020
+ ---
1021
+
1022
+ ## Key EDA Insights
1023
+
1024
+ ### 🔬 Feature Significance
1025
+
1026
+ | Feature | Test | p-value | Significant? |
1027
+ | -------------------------- | -------------- | ------- | ------------ |
1028
+ | subscription\_type | Chi-Square | 0.0000 | ✅ Yes |
1029
+ | payment\_method | Chi-Square | 0.0000 | ✅ Yes |
1030
+ | number\_of\_profiles | Chi-Square | 0.0000 | ✅ Yes |
1031
+ | watch\_hours | Mann-Whitney U | 0.0000 | ✅ Yes |
1032
+ | last\_login\_days | Mann-Whitney U | 0.0000 | ✅ Yes |
1033
+ | avg\_watch\_time\_per\_day | Mann-Whitney U | 0.0000 | ✅ Yes |
1034
+ | age | Mann-Whitney U | 0.7803 | ❌ No |
1035
+ | gender, region, device | Chi-Square | > 0.3 | ❌ No |
1036
+
1037
+ > ✅ These statistically significant features were included in the final model pipeline.
1038
+
1039
+ ---
1040
+
1041
+ ## Project Architecture
1042
+
1043
+ ```bash
1044
+ netflix-churn-prediction/
1045
+ ├── data/ # Raw and processed data
1046
+ ├── models/ # Trained model binaries
1047
+ ├── reports/ # Classification reports & plots
1048
+ ├── static/ # CSS
1049
+ ├── templates/ # HTML UI
1050
+ ├── src/
1051
+ │ ├── data_ingestion.py # Load dataset
1052
+ │ ├── data_preprocessing.py # Pipeline for scaling & encoding
1053
+ │ └── model_training.py # ML training & evaluation
1054
+ ├── main.py # FastAPI backend
1055
+ ├── Dockerfile # Containerization
1056
+ ├── .dvc/ # DVC for data version control
1057
+ ├── .github/workflows/ # CI/CD GitHub Actions
1058
+ └── README.md
1059
+ ```
1060
+
1061
+ ---
1062
+
1063
+ ## ⚙️ End-to-End ML Workflow
1064
+
1065
+ ### 1️⃣ Data Ingestion
1066
+
1067
+ * Loads `.csv` into DataFrame
1068
+ * Handles errors and logs shape/summary
1069
+
1070
+ ### 2️⃣ Preprocessing
1071
+
1072
+ * OneHotEncoding (categorical)
1073
+ * StandardScaler (numerical)
1074
+ * Uses `ColumnTransformer` for pipeline modularity
1075
+
1076
+ ### 3️⃣ Model Training
1077
+
1078
+ * Models: `RandomForest`, `GradientBoosting`, `SVC`
1079
+ * `GridSearchCV` for hyperparameter tuning
1080
+ * Model artifacts saved to `models/`
1081
+ * ROC curves + classification reports saved to `reports/`
1082
+
1083
+ ### 4️⃣ MLflow Tracking ✅
1084
+
1085
+ * Tracks experiment metadata, metrics, parameters
1086
+ * Stores models and artifacts
1087
+ * UI accessible at `localhost:5000`
1088
+
1089
+ ---
1090
+
1091
+ ## 🧪 Model Performance
1092
+
1093
+ | Model | Accuracy | F1 Score | ROC AUC (Test) | ROC AUC (CV) | Notes |
1094
+ | ----------------- | -------- | -------- | -------------- | ------------ | ----------------------------- |
1095
+ | Random Forest | 0.99 | 0.99 | **0.9995** | 0.9987 | ✅ Best overall【13†source】 |
1096
+ | Gradient Boosting | 0.99 | 0.99 | 0.9989 | 0.9991 | Robust & efficient【12†source】 |
1097
+ | SVC | 0.93 | 0.93 | 0.9844 | 0.9822 | Lightweight【14†source】 |
1098
+
1099
+ ---
1100
+
1101
+ ## 🌐 FastAPI Deployment
1102
+
1103
+ ### 🔧 API Endpoints:
1104
+
1105
+ * `/`: HTML frontend form for manual input
1106
+ * `/api/predict`: JSON-based API for programmatic inference
1107
+
1108
+ ### 🔌 Model Used:
1109
+
1110
+ * Random Forest (best AUC + accuracy)
1111
+ * Accepts form or JSON input
1112
+ * Returns churn prediction + confidence
1113
+
1114
+ ---
1115
+
1116
+ ## 🐳 Docker Setup
1117
+
1118
+ ```Dockerfile
1119
+ FROM python:3.10-slim
1120
+ WORKDIR /app
1121
+ COPY requirements.txt .
1122
+ RUN pip install -r requirements.txt
1123
+ COPY . .
1124
+ EXPOSE 8000
1125
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
1126
+ ```
1127
+
1128
+ Run locally:
1129
+
1130
+ ```bash
1131
+ docker build -t netflix-churn .
1132
+ docker run -p 8000:8000 netflix-churn
1133
+ ```
1134
+
1135
+ ---
1136
+
1137
+ ## 🔁 CI/CD Pipeline (GitHub Actions)
1138
+
1139
+ ### ✅ Stages:
1140
+
1141
+ 1. **Test Phase**
1142
+
1143
+ * Install dependencies
1144
+ * Run `pytest` on unit tests
1145
+ * Pull versioned data using `dvc pull`
1146
+
1147
+ 2. **Build Phase**
1148
+
1149
+ * Docker image build with `CACHEBUST` arg
1150
+ * Push to DockerHub using GitHub Secrets
1151
+
1152
+ 3. **Deploy Phase**
1153
+
1154
+ * SSH into EC2 instance
1155
+ * Stop, remove old container
1156
+ * Pull and launch updated Docker image
1157
+
1158
+ ### 🔐 GitHub Repository Secrets
1159
+
1160
+ | Name | Purpose |
1161
+ | ----------------------- | ---------------------------------- |
1162
+ | `AWS_ACCESS_KEY_ID` | AWS auth for DVC S3 |
1163
+ | `AWS_SECRET_ACCESS_KEY` | AWS auth for DVC S3 |
1164
+ | `DOCKER_USERNAME` | DockerHub username for push |
1165
+ | `DOCKER_PASSWORD` | DockerHub password/token |
1166
+ | `EC2_HOST` | Public IP/DNS of EC2 instance |
1167
+ | `EC2_USER` | SSH user for EC2 login |
1168
+ | `EC2_SSH_KEY` | Private SSH key for GitHub Actions |
1169
+
1170
+ ---
1171
+
1172
+ ## 🧬 Data Versioning with DVC
1173
+
1174
+ * Tracks raw and preprocessed data versions
1175
+ * Uses `.dvc/config` to connect to **AWS S3** remote
1176
+ * Run `dvc push` and `dvc pull` to sync across environments
1177
+ * Ensures reproducibility in CI and local experiments
1178
+
1179
+ ---
1180
+
1181
+ ## 📌 Business Value & Insights
1182
+
1183
+ * 🧠 **High-risk churn users** are linked to:
1184
+
1185
+ * Low engagement (low watch hours)
1186
+ * Infrequent logins
1187
+ * Basic plans & non-card payments
1188
+
1189
+ * 📈 **Operational Benefits**:
1190
+
1191
+ * Preemptive retention campaigns
1192
+ * Personalized offers to vulnerable users
1193
+ * Reduce marketing costs via targeted outreach
1194
+
1195
+ ---
1196
+
1197
+ ## ✅ Run Locally (No Docker)
1198
+
1199
+ ```bash
1200
+ git clone <repo_url>
1201
+ cd netflix-churn-prediction
1202
+ python src/model_training.py # Train all models
1203
+ uvicorn main:app --reload # Launch API server
1204
+ ```
1205
+
1206
+ ---
1207
+ ## Summary
1208
+
1209
+ | **Component** | **Implemented** | **Tool/Service Used** |
1210
+ | ------------------------ | --------------- | ----------------------------------------- |
1211
+ | **Data Versioning** | ✅ | `DVC` with `AWS S3` remote |
1212
+ | **Data Ingestion** | ✅ | `pandas`, custom Python class |
1213
+ | **Data Preprocessing** | ✅ | `scikit-learn` Pipelines |
1214
+ | **Model Training** | ✅ | `scikit-learn`, `GridSearchCV` |
1215
+ | **Experiment Tracking** | ✅ | `MLflow` (local server: `127.0.0.1:5000`) |
1216
+ | **Model Evaluation** | ✅ | `classification_report`, ROC AUC |
1217
+ | **Model Packaging** | ✅ | `joblib` for serialization |
1218
+ | **API Deployment** | ✅ | `FastAPI` on `AWS EC2` |
1219
+ | **Web UI** | ✅ | HTML + Bootstrap via Jinja2 |
1220
+ | **Containerization** | ✅ | `Docker` (with `Dockerfile`) |
1221
+ | **CI/CD Pipeline** | ✅ | `GitHub Actions` |
1222
+ | **Cloud Hosting** | ✅ | `AWS EC2`, SSH-based deployment |
1223
+ | **Secrets Management** | ✅ | `GitHub Secrets` |
1224
+ | **Testing** | ✅ | `pytest`, CI-tested |
1225
+
1226
+ ---
1227
+ ## 🙌 Author
1228
+
1229
+ * 👨‍💻 Katta Sai Pranav Reddy
1230
+
1231
+ ---
1232
+ ## 🔗 Links
1233
+
1234
+ 🔍 GitHub Repo: Netflix-Customer-Churn-Prediction-Using-Machine-Learning
1235
+
1236
+ 🐳 DockerHub: pranavreddy123/netflix-churn-prediction
1237
+
1238
+ ## 📎 Tech Stack
1239
+
1240
+ * **Python 3.10**
1241
+ * **Scikit-learn**, **MLflow**, **DVC**, **FastAPI**, **Docker**
1242
+ * **GitHub Actions**, **AWS EC2**, **S3 Remote Storage**
1243
+ ---
1244
+
1245
+ ### **Skills**
1246
+
1247
+ * **Tools:** MLflow, DVC, Docker, Git, GitHub Actions, AWS (EC2, S3, ECR), FAISS, Pinecone, Hugging Face, LangChain, LangSmith, FastAPI
1248
+ * **Programming & Technical Skills:** Python, SQL, HTML, CSS, Scikit-learn, TensorFlow, Keras, Statistics
1249
+ * **Data Science & Machine Learning:** Data Preprocessing, EDA, Feature Engineering, Model Training & Evaluation, Hyperparameter Tuning, Clustering, MLOps, Semantic Search, Retrieval-Augmented Generation (RAG), CNN, RNN, GPT, Transformers, Fine-Tuning, Prompt Engineering
1250
+ * **Data Visualization & Analysis:** Pandas, NumPy, Matplotlib, Seaborn
1251
+
1252
+ ---
1253
+
1254
+ hobbies section
1255
+
1256
+ ---
1257
+
1258
+ ### **Hobbies & Interests**
1259
+ Hobbies & Interests
1260
+
1261
+ * Playing Cricket
1262
+ * Watching Football
1263
+ * Reading Books
1264
+ * Exploring Latest Advancements in Artificial Intelligence
1265
+ * Browsing the Internet for Tech & Knowledge Updates
1266
+
1267
+ ---
1268
+
1269
+ ### Contact Information
1270
+
1271
+ Contact Information
1272
+
1273
+ 📞 Phone: +91 93475 41040
1274
+ 📧 Email: [kattapranavreddy@gmail.com](mailto:kattapranavreddy@gmail.com)
1275
+ 💻 GitHub: [github.com/ka1817](https://github.com/ka1817)
1276
+ 🔗 LinkedIn: [linkedin.com/in/pranav-reddy-katta](https://www.linkedin.com/in/pranav-reddy-katta/)
1277
+ ---
1278
+
1279
+ ---
1280
+ Certifications By Pranav Reddy
1281
+ Certifications:
1282
+ (All from Udemy)
1283
+ - Python for Data Science and Machine Learning
1284
+ - The Complete SQL Bootcamp
1285
+ - Generative AI with LangChain and HuggingFace
1286
+ - End-To-End MLOps Bootcamp
1287
+
1288
+ ---
1289
+
local_faiss_index/index.faiss ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c55f659ccefaca4d036e0cea1353effc17f22e7a080035c43c30739386c6806
3
+ size 101421
local_faiss_index/index.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98d51e77c2b495b1ec0b617fd15f33d955debb37bbdd2c5e14a897ac348f3a7e
3
+ size 65378
main.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from fastapi import FastAPI, Request
from fastapi.responses import HTMLResponse
from fastapi.staticfiles import StaticFiles
from fastapi.templating import Jinja2Templates
from pydantic import BaseModel
from contextlib import asynccontextmanager
from src.retrival_generation import RetrievalGeneration
import uvicorn


class QueryRequest(BaseModel):
    # Single natural-language question to run through the RAG chain.
    query: str


# Shared retriever instance. The heavy work (loading the FAISS index and
# building the RAG chain) is deferred to the lifespan hook so that merely
# importing this module stays cheap.
retriever = RetrievalGeneration(vectorstore_path="local_faiss_index")


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Initialize the vector store and RAG chain once at application startup."""
    retriever.init_vectorstore(rebuild=False)
    retriever.build_rag_chain(k=10, top_n=5)
    yield


app = FastAPI(lifespan=lifespan)

app.mount("/static", StaticFiles(directory="static"), name="static")

templates = Jinja2Templates(directory="templates")


@app.get("/", response_class=HTMLResponse)
def home(request: Request):
    """Serve the HTML front-end page."""
    return templates.TemplateResponse("index.html", {"request": request})


@app.post("/predict")
def predict(request: QueryRequest):
    """Answer a query using the RAG chain built during startup."""
    response = retriever.rag_chain.invoke(request.query)
    return {"response": response}


if __name__ == "__main__":
    # Port 8000 matches the Dockerfile (EXPOSE 8000 / uvicorn --port 8000),
    # so local dev runs and containerized runs are reachable at the same URL.
    uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=True)
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ langchain
2
+ langchain-groq
3
+ langchain-community
4
+ pypdf
5
+ python-dotenv
6
+ fastapi
7
+ sentence-transformers
8
+ faiss-cpu
9
+ uvicorn
src/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .data_ingestion import *
2
+ from .data_preprocessing import *
3
+ from .retrival_generation import *
src/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (203 Bytes). View file
 
src/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (278 Bytes). View file
 
src/__pycache__/data_ingestion.cpython-310.pyc ADDED
Binary file (1.48 kB). View file
 
src/__pycache__/data_ingestion.cpython-311.pyc ADDED
Binary file (2.79 kB). View file
 
src/__pycache__/data_preprocessing.cpython-310.pyc ADDED
Binary file (1.48 kB). View file
 
src/__pycache__/data_preprocessing.cpython-311.pyc ADDED
Binary file (2.41 kB). View file
 
src/__pycache__/evaluation.cpython-310.pyc ADDED
Binary file (4.29 kB). View file
 
src/__pycache__/retrival_generation.cpython-310.pyc ADDED
Binary file (4.22 kB). View file
 
src/__pycache__/retrival_generation.cpython-311.pyc ADDED
Binary file (6.46 kB). View file
 
src/data_ingestion.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import logging
from langchain_community.document_loaders import TextLoader

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s - %(message)s",
)
logger = logging.getLogger(__name__)


class DataIngestion:
    """Loads the raw text document that feeds the RAG pipeline."""

    def __init__(self, path: str | None = None):
        """Resolve the file to ingest.

        Falls back to ``<repo>/data/info.txt`` (relative to this module)
        when no explicit path is supplied.
        """
        if path is not None:
            self.path = os.path.abspath(path)
            logger.info(f"Using custom file path: {self.path}")
        else:
            fallback = os.path.join(
                os.path.dirname(__file__), "..", "data", "info.txt"
            )
            self.path = os.path.abspath(fallback)
            logger.info(f"No path provided. Using default file: {self.path}")

    def load_data(self):
        """Load the file via LangChain's TextLoader and return its documents.

        Raises:
            FileNotFoundError: if the resolved path does not exist.
        """
        logger.debug(f"Checking if file exists at: {self.path}")
        if not os.path.exists(self.path):
            logger.error(f"File not found at {self.path}")
            raise FileNotFoundError(f"File not found: {self.path}")

        logger.info(f"Loading file: {self.path}")
        docs = TextLoader(self.path, encoding="utf-8").load()
        logger.info(f"Loaded {len(docs)} documents from {self.path}")
        return docs
src/data_preprocessing.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import logging
from langchain.text_splitter import RecursiveCharacterTextSplitter
from src.data_ingestion import DataIngestion

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s - %(message)s",
)
logger = logging.getLogger(__name__)


class DataSplitting:
    """Splits ingested documents into overlapping character chunks."""

    def __init__(self, chunk_size: int = 40, chunk_overlap: int = 20):
        """Configure chunking parameters.

        Args:
            chunk_size: Maximum characters per chunk.
            chunk_overlap: Characters shared between consecutive chunks.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        logger.info(
            f"Initialized DataSplitting with chunk_size={chunk_size}, chunk_overlap={chunk_overlap}"
        )

    def chunking(self, docs=None):
        """Split documents into chunks.

        Args:
            docs: Optional pre-loaded documents to split. When None (the
                default, preserving the original behavior), documents are
                loaded via DataIngestion's default path.

        Returns:
            List of chunked documents.
        """
        if docs is None:
            logger.info("Starting document ingestion before splitting...")
            data = DataIngestion()
            docs = data.load_data()
        logger.info(f"Received {len(docs)} documents for splitting.")

        splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
        )
        logger.debug("Splitter initialized. Splitting documents...")
        chunks = splitter.split_documents(docs)

        logger.info(f"Created {len(chunks)} chunks from {len(docs)} documents.")
        return chunks
src/evaluation.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
from dotenv import load_dotenv
from datasets import Dataset

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors.cross_encoder_rerank import CrossEncoderReranker
from langchain_community.cross_encoders import HuggingFaceCrossEncoder

from langchain_groq import ChatGroq
from ragas import evaluate
from ragas.metrics import context_precision, context_recall, faithfulness, answer_relevancy
from ragas.run_config import RunConfig

from src.retrival_generation import RetrievalGeneration


class Evaluation:
    """RAGAS-based evaluation harness for the RAG pipeline."""

    def __init__(self, vectorstore_path: str, llm_model: str = "llama-3.3-70b-versatile"):
        """Load the FAISS store, the judge LLM, and the RAG chain under test.

        Args:
            vectorstore_path: Directory containing the persisted FAISS index.
            llm_model: Groq model used as the RAGAS judge LLM.
        """
        load_dotenv()
        self.groq_api_key = os.getenv("GROQ_API_KEY")

        # Judge LLM used by RAGAS to score the generated answers.
        self.llm = ChatGroq(api_key=self.groq_api_key, model=llm_model)

        self.embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

        self.vectorstore_path = vectorstore_path
        self.vectorstore = FAISS.load_local(
            self.vectorstore_path,
            self.embeddings,
            allow_dangerous_deserialization=True
        )

        # Chain under test: answers are always produced by this RAG chain.
        self.rg = RetrievalGeneration(vectorstore_path=vectorstore_path)
        self.rg.init_vectorstore()
        self.qa = self.rg.build_rag_chain()

    def run(self, questions: list, ground_truth: list, use_reranker: bool = False):
        """Run a RAGAS evaluation over the given questions.

        Args:
            questions: Evaluation questions.
            ground_truth: Reference answers, aligned with ``questions``.
            use_reranker: When True, contexts are gathered through a
                cross-encoder reranking retriever instead of plain similarity.

        Returns:
            The RAGAS evaluation result object.

        NOTE(review): only the *contexts* differ between the two modes; the
        answers always come from ``self.qa``, whose internal retriever is
        fixed at construction time — confirm this is the intended comparison.
        """
        if use_reranker:
            cross_encoder_model = HuggingFaceCrossEncoder(model_name="cross-encoder/ms-marco-MiniLM-L-6-v2")
            compressor = CrossEncoderReranker(model=cross_encoder_model, top_n=3)
            retriever = ContextualCompressionRetriever(
                base_compressor=compressor,
                base_retriever=self.vectorstore.as_retriever(search_kwargs={"k": 10})
            )
        else:
            retriever = self.vectorstore.as_retriever(search_kwargs={"k": 10})

        answers, contexts = [], []
        for query in questions:
            answers.append(self.qa.invoke(query))
            # Retriever.invoke() replaces the deprecated get_relevant_documents().
            contexts.append([doc.page_content for doc in retriever.invoke(query)])

        data = {
            "question": questions,
            "ground_truth": ground_truth,
            "answer": answers,
            "contexts": contexts
        }

        dataset = Dataset.from_dict(data)

        # Conservative run configuration to stay within Groq rate limits.
        run_config = RunConfig(
            timeout=290,
            max_retries=5,
            max_wait=30,
            max_workers=1
        )

        result = evaluate(
            dataset=dataset,
            metrics=[context_precision, context_recall, faithfulness, answer_relevancy],
            llm=self.llm,
            embeddings=self.embeddings,
            run_config=run_config,
            batch_size=1
        )

        return result


if __name__ == "__main__":
    base_dir = os.path.dirname(os.path.abspath(__file__))
    vectorstore_path = os.path.join(base_dir, "..", "local_faiss_index")

    evaluation = Evaluation(vectorstore_path)

    questions = [
        "What were Katta Sai Pranav Reddy’s 10th class marks and CGPA?",
        "What subjects did Pranav Reddy study in 12th (Intermediate) and what were his marks?",
        "Can you summarize Pranav Reddy’s professional and project experience?"
    ]

    ground_truth = [
        "Katta Sai Pranav Reddy completed his SSC in March 2019 at Ekalavya Foundation School, Nalgonda, securing A1 grades in most subjects and a B1 in Hindi, with an overall CGPA of 9.5.",
        "In March 2021, Pranav Reddy finished his Intermediate education, achieving nearly full marks in English, Sanskrit, HE, and optional subjects like Mathematics, Physics, and Chemistry, with a total of 982 marks.",
        "Pranav Reddy is an AI and ML engineer with internship experience at iNeuron Intelligence and Unified Mentor, where he worked on customer segmentation and attrition prediction. His projects include the BigBasket SmartCart AI Assistant and Netflix Churn Prediction, showcasing skills in Python, ML pipelines, FAISS, FastAPI, and Generative AI solutions."
    ]

    # Run without reranker
    print("🔹 Baseline Evaluation (no reranker)")
    baseline_result = evaluation.run(questions, ground_truth, use_reranker=False)
    print(baseline_result)

    # Run with reranker
    print("\n🔹 Evaluation with Reranker")
    rerank_result = evaluation.run(questions, ground_truth, use_reranker=True)
    print(rerank_result)
src/retrival_generation.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import logging
3
+ import warnings
4
+ from dotenv import load_dotenv
5
+ from src.data_preprocessing import DataSplitting
6
+ from langchain_community.embeddings import HuggingFaceEmbeddings
7
+ from langchain_groq import ChatGroq
8
+ from langchain_community.vectorstores import FAISS
9
+ from langchain.prompts import PromptTemplate
10
+ from langchain.schema import StrOutputParser
11
+ from langchain.schema.runnable import RunnableParallel, RunnablePassthrough
12
+ from langchain.retrievers import ContextualCompressionRetriever
13
+ from langchain.retrievers.document_compressors.cross_encoder_rerank import CrossEncoderReranker
14
+ from langchain_community.cross_encoders import HuggingFaceCrossEncoder
15
+
16
# Silence noisy third-party warnings (deprecation chatter from the
# embeddings/transformers stack) so logs stay readable.
warnings.filterwarnings("ignore")

# Console logging shared by everything in this module.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s - %(message)s",
    handlers=[logging.StreamHandler()]
)


logger = logging.getLogger("RetrievalGeneration")

# Load .env BEFORE any os.getenv() call below — order matters here.
load_dotenv()
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    # Fail fast at import time: the RAG chain is unusable without credentials.
    raise ValueError("GROQ_API_KEY not found in environment variables.")

# Module-level chat LLM used by the RAG chain; the model id is overridable
# via the GROQ_MODEL env var (default: llama-3.1-8b-instant).
llm = ChatGroq(model=os.getenv("GROQ_MODEL", "llama-3.1-8b-instant"))
33
+
34
+
35
class RetrievalGeneration:
    """Retrieval-augmented generation pipeline over a FAISS vectorstore.

    Loads (or rebuilds) a FAISS index of profile documents, then composes a
    retriever -> cross-encoder reranker -> prompt -> LLM chain that answers
    questions about Katta Sai Pranav Reddy's professional background.
    """

    def __init__(self, vectorstore_path: str = "faiss_store"):
        # Directory where the FAISS index is persisted/loaded.
        self.vectorstore_path = vectorstore_path
        self.vectorstore = None  # populated by init_vectorstore()
        self.rag_chain = None    # populated by build_rag_chain()
        logger.info("RetrievalGeneration initialized with path: %s", vectorstore_path)

    def init_vectorstore(self, rebuild: bool = False):
        """Load the FAISS index from disk, or build it from the raw data.

        Args:
            rebuild: When True, ignore any index already on disk and
                re-chunk/re-embed the source documents from scratch.

        Returns:
            The initialized FAISS vectorstore (also stored on ``self``).
        """
        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
        logger.info("Embeddings model loaded.")

        if os.path.exists(self.vectorstore_path) and not rebuild:
            logger.info("Loading existing FAISS index from: %s", self.vectorstore_path)
            # allow_dangerous_deserialization is needed because FAISS metadata
            # is pickle-backed; acceptable here since the index is built locally.
            self.vectorstore = FAISS.load_local(
                self.vectorstore_path, embeddings, allow_dangerous_deserialization=True
            )
        else:
            logger.warning("Building new FAISS index...")
            chunks = DataSplitting(chunk_size=2000, chunk_overlap=800).chunking()
            logger.info("Data split into %d chunks", len(chunks))
            self.vectorstore = FAISS.from_documents(chunks, embeddings)
            self.vectorstore.save_local(self.vectorstore_path)
            logger.info("FAISS index saved at: %s", self.vectorstore_path)

        return self.vectorstore

    @staticmethod
    def _format_docs(docs):
        """Join retrieved Documents into one plain-text context block.

        Fix: without this step the raw list[Document] repr (including
        metadata) would be interpolated into the {context} prompt slot,
        adding noise the LLM has to read around.
        """
        return "\n\n".join(doc.page_content for doc in docs)

    def build_rag_chain(self, k: int = 10, top_n: int = 5):
        """Assemble the reranked RAG chain.

        Args:
            k: Number of candidate chunks fetched from FAISS.
            top_n: Number of chunks kept after cross-encoder reranking.

        Returns:
            A runnable chain mapping a question string to an answer string.

        Raises:
            ValueError: If ``init_vectorstore()`` has not been run yet.
        """
        if not self.vectorstore:
            raise ValueError("Vectorstore not initialized. Run init_vectorstore() first.")

        logger.info("Creating retriever from FAISS vectorstore (top_k=%d)...", k)
        cross_encoder_model = HuggingFaceCrossEncoder(model_name="cross-encoder/ms-marco-MiniLM-L-6-v2")

        compressor = CrossEncoderReranker(model=cross_encoder_model, top_n=top_n)

        retriever = ContextualCompressionRetriever(
            base_compressor=compressor,
            base_retriever=self.vectorstore.as_retriever(search_kwargs={"k": k}),
        )

        prompt = PromptTemplate(
            template="""
            You are a professional and concise AI assistant that answers questions
            about the career, education, skills, projects, certifications, and professional
            background of **Katta Sai Pranav Reddy**.

            Your job is to:
            - Use ONLY the provided context to answer.
            - Be recruiter-friendly: structured, clear, and professional in tone.
            - If the question is unrelated to Katta Sai Pranav Reddy’s professional profile,
              politely decline by saying:
              "I can only answer questions related to the professional background of Katta Sai Pranav Reddy."
            - If the context does not provide enough information, say:
              "The available information does not cover that detail."

            Context:
            {context}

            Question:
            {question}

            Answer (clear, structured, recruiter-focused):
            """,
            input_variables=["context", "question"]
        )

        # Piping the retriever into _format_docs coerces the function into a
        # RunnableLambda, so {context} receives clean joined text, not a repr.
        self.rag_chain = (
            RunnableParallel({
                "context": retriever | self._format_docs,
                "question": RunnablePassthrough()
            })
            | prompt
            | llm
            | StrOutputParser()
        )

        logger.info("RAG chain with reranking successfully built.")
        return self.rag_chain
111
+
112
+
static/style.css ADDED
@@ -0,0 +1,248 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/* Portfolio chatbot stylesheet.
   Desktop (>=769px): fixed-size centered card.
   Mobile (<=700px): full-screen fixed layout with scrollable quick options. */

body {
    font-family: Arial, sans-serif;
    background: #f3f6fa;
    margin: 0;
    height: 100vh;
    width: 100vw;
    overflow: hidden;
}

/* Desktop: center the chat card in the viewport. */
@media (min-width: 769px) {
    body {
        display: flex;
        justify-content: center;
        align-items: center;
    }

    .chatbot-container {
        width: 650px;
        height: 580px;
        border-radius: 12px;
        box-shadow: 0 8px 20px rgba(0,0,0,0.15);
    }
}

/* Mobile: the container fills the screen and pins to the top-left. */
@media (max-width: 700px) {
    body {
        display: block;
        width: 100vw;
        height: 100vh;
        margin: 0;
        padding: 0;
    }

    .chatbot-container {
        width: 100vw;
        height: 94vh;
        border-radius: 0;
        box-shadow: none;
        display: flex;
        flex-direction: column;
        position: fixed;
        top: 0;
        left: 0;
        padding-bottom: env(safe-area-inset-bottom);
        overflow: hidden;
    }

    .chat-header h2 {
        font-size: 20px;
    }

    .chat-body {
        font-size: 16px;
        padding: 12px;
        overflow-y: auto;
        flex: 1 1 auto;
        min-height: 0;
    }

    #user-input {
        font-size: 16px;
        padding: 12px;
    }

    #send-btn {
        font-size: 16px;
        padding: 12px 16px;
    }

    .quick-options {
        display: flex;
        flex-wrap: wrap;
        justify-content: space-between;
        padding: 10px;
        background: #f4f4f9;
        border-top: 1px solid #ddd;
        position: sticky;
        bottom: 0;
        left: 0;
        right: 0;
        max-height: 45vh;
        overflow-y: auto;
        flex-shrink: 0;
    }

    .option-btn {
        flex: 1 1 48%;
        font-size: 16px;
        margin: 6px 1%;
        padding: 14px;
        white-space: normal;
        word-break: break-word;
        text-align: center;
        box-sizing: border-box;
    }
}

/* iOS Safari only (-webkit-touch-callout): account for the collapsing URL
   bar. Fix: the vh fallback must come FIRST so the svh declaration wins on
   browsers that support small-viewport units (previously vh overrode svh,
   making the svh line dead). */
@supports (-webkit-touch-callout: none) {
    @media (max-width: 700px) {
        .chatbot-container {
            height: 88vh;  /* fallback for browsers without svh support */
            height: 88svh; /* small viewport height: excludes the URL bar */
        }
    }
}

.chatbot-container {
    background: #fff;
    display: flex;
    flex-direction: column;
    overflow: hidden;
}

.chat-header {
    background: linear-gradient(135deg, #6366f1, #3b82f6);
    color: white;
    padding: 12px;
    display: flex;
    justify-content: space-between;
    align-items: center;
}

.chat-header h2 {
    margin: 0;
    font-size: 16px;
}

.github-btn {
    display: flex;
    align-items: center;
    justify-content: center;
    margin-left: auto;
}

.github-btn img {
    width: 26px;
    height: 26px;
    cursor: pointer;
    filter: invert(1); /* white icon on the gradient header */
    transition: transform 0.2s ease;
}

.github-btn img:hover {
    transform: scale(1.2);
}

.chat-body {
    flex: 1 1 auto;
    padding: 10px;
    overflow-y: auto;
    background: #fafafa;
    display: flex;
    flex-direction: column;
    min-height: 0; /* lets the flex child actually shrink and scroll */
}

.bot-msg {
    background: #e5edff;
    padding: 8px 12px;
    border-radius: 10px;
    margin: 4px 0;
    max-width: 90%;
    line-height: 1.4;
    display: inline-block;
    word-wrap: break-word;
    overflow-wrap: anywhere;
}

.user-msg {
    background: #d1fae5;
    padding: 8px 12px;
    border-radius: 10px;
    margin: 4px 0;
    max-width: 90%;
    align-self: flex-start;
    text-align: left;
    line-height: 1.4;
    display: inline-block;
    word-wrap: break-word;
    overflow-wrap: anywhere;
}

/* Transient "Thinking..." bubble shown while awaiting the backend. */
.thinking {
    font-style: italic;
    color: #666;
    background: #f3f4f6;
    animation: blink 1.2s infinite;
    align-self: flex-start;
    text-align: left;
    line-height: 1.3;
    display: inline-block;
}

@keyframes blink {
    0% { opacity: 0.4; }
    50% { opacity: 1; }
    100% { opacity: 0.4; }
}

.chat-footer {
    display: flex;
    padding: 10px;
    border-top: 1px solid #ddd;
    background: #f9f9f9;
}

#user-input {
    flex: 1;
    padding: 8px;
    border: 1px solid #ccc;
    border-radius: 8px;
}

#send-btn {
    margin-left: 6px;
    padding: 8px 12px;
    background: #6366f1;
    color: white;
    border: none;
    border-radius: 8px;
    cursor: pointer;
}

.quick-options {
    display: flex;
    flex-wrap: wrap;
    padding: 10px;
    background: #f4f4f9;
    border-top: 1px solid #ddd;
}

.option-btn {
    flex: 1 1 100%;
    background: #eef2ff;
    border: none;
    padding: 10px;
    margin: 6px 0;
    border-radius: 8px;
    cursor: pointer;
    transition: 0.2s;
    text-align: center;
    word-wrap: break-word;
    white-space: normal;
}

.option-btn:hover {
    background: #dbeafe;
}
templates/index.html ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Pranav Reddy Portfolio Chatbot</title>
    <link rel="stylesheet" href="/static/style.css?v=6">
</head>
<body>
    <div class="chatbot-container">
        <div class="chat-header">
            <h2>Ask Pranav</h2>

            <a href="https://github.com/ka1817" target="_blank" class="github-btn">
                <img src="https://cdn.jsdelivr.net/gh/devicons/devicon/icons/github/github-original.svg" alt="GitHub">
            </a>
        </div>

        <div class="chat-body" id="chat-body">
            <div class="bot-msg">👋 Hi! I’m Pranav Reddy’s assistant. What would you like to know?</div>
        </div>

        <div class="chat-footer">
            <input type="text" id="user-input" placeholder="Ask a question..." />
            <button id="send-btn">➤</button>
        </div>

        <!-- Canned queries; data-query text is sent verbatim to the backend,
             so spelling matters for retrieval quality (fixed: "Samrt Cart" ->
             "Smart Cart", "Netflex churn" -> "Netflix Churn"). -->
        <div class="quick-options">
            <button class="option-btn" data-query="what are Pranav Reddy’s Projects-BigBasket Smart Cart and Netflix Churn Prediction">
                📂 Project Details
            </button>
            <button class="option-btn" data-query="How can I contact Pranav Reddy?">
                📞 Contact Details
            </button>
            <button class="option-btn" data-query="What are Pranav Reddy’s Skills?">
                💡 Skills
            </button>
            <button class="option-btn" data-query="Share Pranav Reddy’s Work Experience">
                💼 Experience
            </button>
        </div>
    </div>

    <script>
        const chatBody = document.getElementById("chat-body");
        const userInput = document.getElementById("user-input");
        const sendBtn = document.getElementById("send-btn");
        const optionButtons = document.querySelectorAll(".option-btn");

        // Append a chat bubble and keep the view scrolled to the newest message.
        function appendMessage(sender, text, extraClass = "") {
            const msg = document.createElement("div");
            msg.classList.add(sender === "user" ? "user-msg" : "bot-msg");
            if (extraClass) msg.classList.add(extraClass);
            msg.textContent = text;
            chatBody.appendChild(msg);
            chatBody.scrollTop = chatBody.scrollHeight;
            return msg;
        }

        // POST the query to /predict, showing a placeholder while waiting.
        async function sendMessage(query) {
            appendMessage("user", query);
            userInput.value = "";

            const thinkingMsg = appendMessage("bot", "🤔 Thinking...", "thinking");

            try {
                const res = await fetch("/predict", {
                    method: "POST",
                    headers: { "Content-Type": "application/json" },
                    body: JSON.stringify({ query }),
                });

                const data = await res.json();

                chatBody.removeChild(thinkingMsg);

                appendMessage("bot", data.response);
            } catch (error) {
                chatBody.removeChild(thinkingMsg);
                appendMessage("bot", "⚠️ Error fetching response.");
            }
        }

        sendBtn.addEventListener("click", () => {
            if (userInput.value.trim()) {
                sendMessage(userInput.value.trim());
            }
        });

        // "keydown" replaces the deprecated "keypress" event; behavior is the
        // same: Enter submits the trimmed input.
        userInput.addEventListener("keydown", (e) => {
            if (e.key === "Enter" && userInput.value.trim()) {
                sendMessage(userInput.value.trim());
            }
        });

        optionButtons.forEach((btn) => {
            btn.addEventListener("click", () => {
                const query = btn.getAttribute("data-query");
                sendMessage(query);
            });
        });
    </script>
</body>
</html>