Upload 6 files
Browse files- DVCarFraudDetection.csv +0 -0
- Dockerfile +9 -0
- RFModel.pkl +3 -0
- X_train.csv +0 -0
- app.py +690 -0
- requirements.txt +16 -0
DVCarFraudDetection.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Dockerfile
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.9

WORKDIR /app

# Install dependencies before copying the application sources so this layer
# is served from the build cache when only the code changes.
COPY requirements.txt /app/requirements.txt
RUN python3 -m pip install --no-cache-dir --upgrade pip \
    && pip install --no-cache-dir -r requirements.txt

# COPY (not ADD) for plain local files: no archive extraction or URL fetching.
COPY . /app

CMD [ "python", "app.py" ]
|
RFModel.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9ca585f9b657db88ecff3b835aeb36c63e36615ef8e5989069450a3c65bde044
|
| 3 |
+
size 41657465
|
X_train.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
app.py
ADDED
|
@@ -0,0 +1,690 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from flask import Flask, render_template, request
|
| 2 |
+
import matplotlib.pyplot as plt
|
| 3 |
+
import pandas as pd
|
| 4 |
+
from joblib import load
|
| 5 |
+
import seaborn as sns
|
| 6 |
+
import io
|
| 7 |
+
from wordcloud import WordCloud
|
| 8 |
+
import base64
|
| 9 |
+
import string
|
| 10 |
+
import nltk
|
| 11 |
+
from nltk.corpus import stopwords
|
| 12 |
+
from nltk.tokenize import word_tokenize
|
| 13 |
+
from nltk.stem import WordNetLemmatizer
|
| 14 |
+
from google_play_scraper import app, Sort, reviews_all
|
| 15 |
+
from nltk.sentiment.vader import SentimentIntensityAnalyzer
|
| 16 |
+
from nltk.corpus import stopwords
|
| 17 |
+
from collections import Counter
|
| 18 |
+
from matplotlib.sankey import Sankey
|
| 19 |
+
import networkx as nx
|
| 20 |
+
|
| 21 |
+
# Flask application object used by every @app.route decorator in this module.
# NOTE(review): this rebinds the name `app`, which the
# `from google_play_scraper import app, Sort, reviews_all` line above also
# imports, so the scraper's `app` is not reachable under that name from here on.
app = Flask(__name__)
|
| 22 |
+
|
| 23 |
+
def preprocess_text(text):
    """Normalize a piece of review text for analysis.

    The text is lower-cased, stripped of ASCII punctuation, tokenized with
    NLTK, filtered against the English stop-word list, lemmatized, and
    re-joined with single spaces.

    Args:
        text: Raw text. ``None`` and any other non-string value (for example
            the ``NaN`` pandas uses for missing cells) are treated as empty.

    Returns:
        The cleaned text, or ``''`` when there is nothing to process.
    """
    if not isinstance(text, str):
        return ''

    text = text.lower().translate(str.maketrans('', '', string.punctuation))
    if not text.strip():
        # Nothing left to tokenize, so skip the NLTK calls entirely.
        return ''

    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    tokens = [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(text)
        if word not in stop_words
    ]
    return ' '.join(tokens)
|
| 42 |
+
|
| 43 |
+
def preprocess_dataframe(df):
    """Reduce a raw reviews frame to the columns the analysis uses.

    The result holds ``content`` and ``score`` as found in ``df`` plus an
    ``IsReplied`` column that is ``'Yes'`` when ``replyContent`` is a
    non-blank string and ``'No'`` otherwise (``None``, ``NaN``, empty or
    whitespace-only).

    The earlier implementation also derived reply month/year and filled
    ``appVersion``, but discarded all of that in its final column selection,
    so those steps are omitted. Unlike before, ``df`` itself is left
    untouched.

    Args:
        df: Frame with at least ``content``, ``score`` and ``replyContent``
            columns.

    Returns:
        A new, independent DataFrame with the columns
        ``['content', 'score', 'IsReplied']``.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    is_replied = df['replyContent'].apply(
        lambda reply: 'Yes' if isinstance(reply, str) and reply.strip() else 'No'
    )

    # Copy so callers can add columns without writing into a view of ``df``.
    result = df[['content', 'score']].copy()
    result['IsReplied'] = is_replied
    return result
|
| 70 |
+
|
| 71 |
+
def _get_sentiment_analyzer():
    """Return a shared VADER analyzer, creating it on first use."""
    analyzer = getattr(_get_sentiment_analyzer, '_instance', None)
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()
        _get_sentiment_analyzer._instance = analyzer
    return analyzer


def analyze_sentiment(text, score):
    """Label a review as positive, negative or neutral.

    The VADER compound score of ``text`` and the star rating must agree for a
    review to be labelled positive or negative; every other combination is
    neutral.

    Args:
        text: Review text to score.
        score: Numeric star rating given with the review.

    Returns:
        ``'positive'`` when compound >= 0.05 and ``score`` >= 3,
        ``'negative'`` when compound <= -0.05 and ``score`` < 3,
        otherwise ``'neutral'``.
    """
    # This function is applied once per review row, so reuse one analyzer
    # instead of constructing a new one on every call.
    compound = _get_sentiment_analyzer().polarity_scores(text)['compound']

    if compound >= 0.05 and score >= 3:
        return 'positive'
    if compound <= -0.05 and score < 3:
        return 'negative'
    return 'neutral'
|
| 83 |
+
|
| 84 |
+
@app.route('/predict/app', methods=['POST'])
def predict_appFraud():
    """Classify a Google Play app as fraud / not fraud from its reviews.

    Reads ``app-id`` and ``app-name`` from the submitted form, downloads the
    app's reviews, labels each one with ``analyze_sentiment`` and reports the
    app as not fraudulent only when positive reviews outnumber negative ones.
    Summary statistics and six charts (base64-encoded PNG strings) are passed
    to ``app_result.html``.

    Returns:
        The rendered ``app_result.html`` page, or a plain-text 404 response
        when no reviews were returned for the submitted app id.
    """
    app_id = request.form['app-id']
    app_name = request.form['app-name']

    # Scrape reviews for the specified app.
    # NOTE(review): "Eng" does not look like a two-letter language code such
    # as "en" -- confirm against the google_play_scraper documentation.
    reviews = reviews_all(app_id, sleep_milliseconds=0, lang="Eng", country="in", sort=Sort.NEWEST)
    if not reviews:
        # Without this guard the column lookups below fail with a KeyError.
        # The message is a fixed string: no user input is echoed back.
        return 'No reviews were found for the requested app.', 404

    df = preprocess_dataframe(pd.json_normalize(reviews))
    # Make sure every review body is a string before it is analysed.
    df['content'] = df['content'].fillna('').astype(str)
    df['sentiment'] = df.apply(lambda row: analyze_sentiment(row['content'], row['score']), axis=1)

    # Summary statistics.
    total_reviews = len(df)
    positive_reviews = (df['sentiment'] == 'positive').sum()
    negative_reviews = (df['sentiment'] == 'negative').sum()
    neutral_reviews = (df['sentiment'] == 'neutral').sum()
    average_rating = round(df['score'].mean(), 2)
    positive_percentage = round((positive_reviews / total_reviews) * 100, 2)
    negative_percentage = round((negative_reviews / total_reviews) * 100, 2)
    neutral_percentage = round((neutral_reviews / total_reviews) * 100, 2)
    replied_percentage = round((df['IsReplied'] == 'Yes').mean() * 100, 2)

    if positive_reviews > negative_reviews:
        result = "The App is Not Fraud"
    else:
        result = "The App is Fraud"

    # One colour per label, shared by every chart. value_counts() orders
    # labels by frequency, so positional colour lists would change meaning
    # from one app to the next.
    sentiment_palette = {'positive': 'green', 'negative': 'red', 'neutral': 'blue'}
    reply_palette = {'Yes': 'lightgreen', 'No': 'lightcoral'}

    # 1. Percentage pie chart of reviews
    reviews_counts = df['sentiment'].value_counts()
    fig1, ax1 = plt.subplots(figsize=(6, 4))
    ax1.pie(reviews_counts, labels=reviews_counts.index,
            colors=[sentiment_palette[label] for label in reviews_counts.index],
            autopct='%1.1f%%', startangle=140)
    ax1.set_title('Percentage of Reviews in Fraud App')
    buffer_data1 = save_plot_to_buffer(fig1)

    # 2. Count plot of each type of review
    fig2, ax2 = plt.subplots(figsize=(6, 4))
    sns.countplot(x='sentiment', data=df, palette=sentiment_palette, ax=ax2)
    ax2.set_title('Count of Each Review Type in Fraud App')
    ax2.set_xlabel('Sentiment')
    ax2.set_ylabel('Count')
    buffer_data2 = save_plot_to_buffer(fig2)

    # 3. Histogram for each type of score
    fig3, ax3 = plt.subplots(figsize=(6, 4))
    sns.histplot(data=df, x='score', hue='sentiment', multiple='stack', bins=20, ax=ax3)
    ax3.set_title('Histogram of Rating for Each Review Type in Fraud App')
    ax3.set_xlabel('Score')
    ax3.set_ylabel('Count')
    buffer_data3 = save_plot_to_buffer(fig3)

    # 4. Pie chart of isreplied (Yes vs No)
    replied_counts = df['IsReplied'].value_counts()
    fig4, ax4 = plt.subplots(figsize=(6, 4))
    ax4.pie(replied_counts, labels=replied_counts.index, autopct='%1.1f%%', startangle=140,
            colors=[reply_palette[label] for label in replied_counts.index])
    ax4.set_title('Percentage of Replies in Fraud App Reviews')
    buffer_data4 = save_plot_to_buffer(fig4)

    # 5. Violin plot of review vs score
    fig5, ax5 = plt.subplots(figsize=(6, 4))
    sns.violinplot(x='sentiment', y='score', data=df, palette=sentiment_palette, ax=ax5)
    ax5.set_title('Violin Plot of Review vs Rating in Fraud App')
    ax5.set_xlabel('Sentiment')
    ax5.set_ylabel('Score')
    buffer_data5 = save_plot_to_buffer(fig5)

    # 6. Joint count plot of sentiment by reply status.
    # catplot is figure-level and creates its own figure, so no plt.figure()
    # call precedes it (an extra one would never be closed).
    sns.catplot(x='sentiment', kind='count', hue='IsReplied', data=df, palette='Set1', height=4, aspect=1)
    plt.title('Sentiments vs Review Reply Status')
    plt.xlabel('Sentiment')
    plt.ylabel('Count')
    plt.tight_layout()
    buffer_data6 = save_plot_to_buffer(plt.gcf())

    return render_template('app_result.html', result=result, app_name=app_name,
                           total_reviews=total_reviews, positive_reviews=positive_reviews,
                           negative_reviews=negative_reviews, neutral_reviews=neutral_reviews,
                           average_rating=average_rating, positive_percentage=positive_percentage,
                           negative_percentage=negative_percentage, neutral_percentage=neutral_percentage,
                           replied_percentage=replied_percentage,
                           plot1=buffer_data1, plot2=buffer_data2, plot3=buffer_data3,
                           plot4=buffer_data4, plot5=buffer_data5, plot6=buffer_data6)
|
| 200 |
+
|
| 201 |
+
# Load the pre-trained model.
# NOTE(review): joblib's load() unpickles the file, which can execute code
# embedded in it -- only deploy an RFModel.pkl from a trusted source.
best_rf_classifier = load('RFModel.pkl')

# Load X_train; make_prediction uses its column names to align user input.
X_train = pd.read_csv('X_train.csv')

# Load the dataset that the insurance charts are drawn from.
# NOTE(review): all three paths are relative to the process working directory.
df = pd.read_csv('DVCarFraudDetection.csv')
|
| 209 |
+
|
| 210 |
+
@app.route('/')
def index():
    """Serve the site root by rendering ``index.html``."""
    return render_template('index.html')
|
| 213 |
+
|
| 214 |
+
@app.route('/vehicle_insurance')
def vehicle_insurance():
    """Render ``vehicle.html`` for the ``/vehicle_insurance`` URL."""
    return render_template('vehicle.html')
|
| 217 |
+
|
| 218 |
+
@app.route('/predict/insurance')
def predict_insurance():
    """Render ``vehicle.html`` for GET requests to ``/predict/insurance``.

    POST requests to the same URL are handled by ``make_prediction``.
    """
    return render_template('vehicle.html')
|
| 221 |
+
|
| 222 |
+
@app.route('/dataset')
def dataset_display():
    """Render ``dataset.html`` with the insurance dataset and ten charts.

    All charts are drawn from the module-level ``df`` and handed to the
    template as base64-encoded PNG strings (``plot1`` .. ``plot10``); the
    frame itself is passed as ``df``.
    """
    fraud_palette = {0: 'green', 1: 'red'}

    # 1. Number of records per car company
    fig1, ax1 = plt.subplots(figsize=(6, 4))
    sns.countplot(y='CarCompany', data=df, ax=ax1)
    buffer_data1 = save_plot_to_buffer(fig1)

    # 2. Base policy split by fraud flag
    fig2, ax2 = plt.subplots(figsize=(6, 4))
    sns.countplot(x='BasePolicy', hue='IsFraud', data=df, palette=fraud_palette, ax=ax2)
    buffer_data2 = save_plot_to_buffer(fig2)

    # 3. Share of each past-claims count
    fig3, ax3 = plt.subplots(figsize=(6, 4))
    past_claims_counts = df['PastNumberOfClaims'].value_counts()
    ax3.pie(past_claims_counts, labels=past_claims_counts.index, autopct='%1.1f%%')
    ax3.set_title('Past Number of Claims Count')
    buffer_data3 = save_plot_to_buffer(fig3)

    # 4. Address change split by fraud flag
    fig4, ax4 = plt.subplots(figsize=(6, 4))
    sns.countplot(x='IsAddressChanged', hue='IsFraud', data=df, palette=fraud_palette, ax=ax4)
    ax4.set_title('Address Change and Fraud Distribution')
    ax4.set_xlabel('Is Address Changed?')
    ax4.set_ylabel('Count')
    ax4.legend(title='Is Fraud')
    buffer_data4 = save_plot_to_buffer(fig4)

    # 5. Car company vs owner gender heatmap of record counts
    fig5, ax5 = plt.subplots(figsize=(6, 4))
    heatmap_data = df.groupby(['CarCompany', 'OwnerGender']).size().unstack()
    sns.heatmap(heatmap_data, annot=True, cmap='coolwarm', fmt='.2f', ax=ax5)
    ax5.set_title('Car Company vs Owner Gender')
    ax5.set_xlabel('Owner Gender')
    ax5.set_ylabel('Car Company')
    ax5.tick_params(axis='y', labelrotation=0)  # keep company names horizontal
    fig5.tight_layout()
    buffer_data5 = save_plot_to_buffer(fig5)

    # 6. Share of each supplements count
    fig6, ax6 = plt.subplots(figsize=(6, 4))
    num_supplements_counts = df['NumberOfSuppliments'].value_counts()
    ax6.pie(num_supplements_counts, labels=num_supplements_counts.index, autopct='%1.1f%%')
    ax6.set_title('Number of Supplements Count')
    buffer_data6 = save_plot_to_buffer(fig6)

    # 7. Police report filed split by fraud flag
    fig7, ax7 = plt.subplots(figsize=(6, 4))
    sns.countplot(x='PoliceReportFiled', hue='IsFraud', data=df, ax=ax7)
    buffer_data7 = save_plot_to_buffer(fig7)

    # 8. Owner age distribution per gender
    fig8, ax8 = plt.subplots(figsize=(6, 4))
    sns.violinplot(x='OwnerGender', y='OwnerAge', data=df, palette={'Male': 'blue', 'Female': 'pink'}, ax=ax8)
    buffer_data8 = save_plot_to_buffer(fig8)

    # 9. Owner age vs number of supplements
    fig9, ax9 = plt.subplots(figsize=(6, 4))
    sns.scatterplot(x='OwnerAge', y='NumberOfSuppliments', data=df, ax=ax9)
    ax9.set_title('Scatter Plot of OwnerAge vs NumberOfSuppliments')
    fig9.tight_layout()
    buffer_data9 = save_plot_to_buffer(fig9)

    # 10. Car price per car category
    fig10, ax10 = plt.subplots(figsize=(6, 4))
    sns.boxplot(x='CarCategory', y='CarPrice', data=df, ax=ax10)
    buffer_data10 = save_plot_to_buffer(fig10)

    # Pass the frame loaded at import time instead of re-reading the CSV on
    # every request from 'env\DVCarFraudDetection.csv', a backslash path that
    # only resolves on Windows.
    # NOTE(review): assumes that file was a copy of DVCarFraudDetection.csv.
    return render_template('dataset.html', df=df, plot1=buffer_data1, plot2=buffer_data2,
                           plot3=buffer_data3, plot4=buffer_data4, plot5=buffer_data5, plot6=buffer_data6,
                           plot7=buffer_data7, plot8=buffer_data8, plot9=buffer_data9, plot10=buffer_data10)
|
| 328 |
+
|
| 329 |
+
|
| 330 |
+
@app.route('/predict/insurance', methods=['POST'])
def make_prediction():
    """Predict insurance fraud for the submitted claim and render the result.

    The form fields are collected into a one-row DataFrame, one-hot encoded,
    re-indexed to the columns of ``X_train`` (columns absent from the input
    are filled with 0) and passed to ``best_rf_classifier``. Eight charts
    drawn from the module-level ``df`` are returned alongside the verdict as
    base64-encoded PNG strings.

    Returns:
        The rendered ``prediction_result.html`` page, or a plain-text 400
        response when a numeric field cannot be parsed as an integer.
    """
    form = request.form
    try:
        OwnerAge = int(form['OwnerAge'])
        CarPrice = int(form['CarPrice'])
        NumberOfSuppliments = int(form['NumberOfSuppliments'])
        PastNumberOfClaims = int(form['PastNumberOfClaims'])
    except ValueError:
        # Malformed numbers are a client error, not a server failure.
        return ('Owner age, car price, number of supplements and past number '
                'of claims must be whole numbers.'), 400

    # Preprocess the input data
    car_price = CarPrice / 10  # scaling car price as in your previous code
    user_df = pd.DataFrame({
        'CarCompany': [form['CarCompany']],
        'AccidentArea': [form['AccidentArea']],
        'OwnerGender': [form['OwnerGender']],
        'OwnerAge': [OwnerAge],
        'Fault': [form['Fault']],
        'CarCategory': [form['CarCategory']],
        'CarPrice': [car_price],
        'PoliceReportFiled': [form['PoliceReportFiled']],
        'WitnessPresent': [form['WitnessPresent']],
        'AgentType': [form['AgentType']],
        'NumberOfSuppliments': [NumberOfSuppliments],
        'BasePolicy': [form['BasePolicy']],
        'IsAddressChanged': [form['IsAddressChanged']],
        'PastNumberOfClaims': [PastNumberOfClaims],
    })
    # Align the encoded row with the columns the model was trained on.
    processed_user_input = pd.get_dummies(user_df).reindex(columns=X_train.columns, fill_value=0)

    prediction = best_rf_classifier.predict(processed_user_input)
    if prediction[0] == 1:
        result = "Fraud in Insurance"
    else:
        result = "No Fraud in Insurance"

    # 1. Owner gender split by fraud flag
    fig1, ax1 = plt.subplots(figsize=(6, 4))
    sns.countplot(x='OwnerGender', hue='IsFraud', data=df, ax=ax1)
    buffer_data1 = save_plot_to_buffer(fig1)

    # 2. Car price distribution per car category
    fig2, ax2 = plt.subplots(figsize=(6, 4))
    sns.violinplot(x='CarCategory', y='CarPrice', data=df, ax=ax2)
    buffer_data2 = save_plot_to_buffer(fig2)

    # 3. Agent type split by fraud flag
    fig3, ax3 = plt.subplots(figsize=(6, 4))
    sns.countplot(x='AgentType', hue='IsFraud', data=df, ax=ax3)
    buffer_data3 = save_plot_to_buffer(fig3)

    # 4. Base policy share among fraud cases
    fig4, ax4 = plt.subplots(figsize=(6, 4))
    policy_fraud_counts = df[df['IsFraud'] == 1]['BasePolicy'].value_counts()
    ax4.pie(policy_fraud_counts, labels=policy_fraud_counts.index, autopct='%1.1f%%')
    buffer_data4 = save_plot_to_buffer(fig4)

    # 5. Car price for fraud vs non-fraud cases.
    # One call on the full frame puts the two boxes side by side. Plotting
    # the fraud and non-fraud subsets separately drew both at the same x
    # position, because each subset holds a single category.
    fig5, ax5 = plt.subplots(figsize=(6, 4))
    sns.boxplot(x='IsFraud', y='CarPrice', data=df, ax=ax5)
    ax5.set_xlabel('Fraud Status')
    ax5.set_ylabel('Car Price')
    ax5.set_title('Box Plot of Car Price for Fraud and Non-Fraud Cases')
    buffer_data5 = save_plot_to_buffer(fig5)

    # 6. Histogram of past number of claims
    # NOTE(review): the y label says "Fraud cases" but the data is every row
    # of df, not only the fraud rows -- confirm which was intended.
    fig6, ax6 = plt.subplots(figsize=(6, 4))
    sns.histplot(data=df, x='PastNumberOfClaims', bins=range(max(df['PastNumberOfClaims'])+2), kde=False, ax=ax6)
    ax6.set_ylabel('Fraud cases count')
    buffer_data6 = save_plot_to_buffer(fig6)

    # 7. Car category share among fraud cases
    fig7, ax7 = plt.subplots(figsize=(6, 4))
    category_fraud_counts = df[df['IsFraud'] == 1]['CarCategory'].value_counts()
    ax7.pie(category_fraud_counts, labels=category_fraud_counts.index, autopct='%1.1f%%')
    buffer_data7 = save_plot_to_buffer(fig7)

    # 8. Past number of claims split by fraud flag
    fig8, ax8 = plt.subplots(figsize=(6, 4))
    sns.countplot(x='PastNumberOfClaims', hue='IsFraud', data=df, ax=ax8)
    buffer_data8 = save_plot_to_buffer(fig8)

    # Return prediction result and base64 encoded images
    return render_template('prediction_result.html', result=result, plot1=buffer_data1, plot2=buffer_data2,
                           plot3=buffer_data3, plot4=buffer_data4, plot5=buffer_data5, plot6=buffer_data6,
                           plot7=buffer_data7, plot8=buffer_data8)
|
| 460 |
+
|
| 461 |
+
@app.route("/predict/app")
def predict_app():
    """Render ``fraudapp.html`` for GET requests to ``/predict/app``.

    POST requests to the same URL are handled by ``predict_appFraud``.
    """
    return render_template('fraudapp.html')
|
| 464 |
+
|
| 465 |
+
|
| 466 |
+
@app.route("/mobile_app")
def mobile_app():
    """Render ``fraudapp.html`` for the ``/mobile_app`` URL."""
    return render_template('fraudapp.html')
|
| 469 |
+
|
| 470 |
+
|
| 471 |
+
@app.route('/analysis/app')
def analysis_app():
    """Render ``app_analysis.html`` for GET requests to ``/analysis/app``.

    POST requests to the same URL are handled by ``analysisresult_app``.
    """
    return render_template('app_analysis.html')
|
| 474 |
+
|
| 475 |
+
@app.route('/analysis/app', methods=['POST'])
def analysisresult_app():
    """Build the review-analysis page for a Google Play app.

    Reads ``app-id`` and ``app-name`` from the submitted form, downloads the
    app's reviews, labels each one with ``analyze_sentiment`` and renders
    ``app_analysis_final.html`` with the review frame and six images
    (``buffer1`` .. ``buffer6``, base64-encoded PNG strings).

    Returns:
        The rendered ``app_analysis_final.html`` page, or a plain-text 404
        response when no reviews were returned for the submitted app id.
    """
    app_id = request.form['app-id']
    app_name = request.form['app-name']

    # Scrape reviews for the specified app.
    # NOTE(review): "Eng" does not look like a two-letter language code such
    # as "en" -- confirm against the google_play_scraper documentation.
    reviews = reviews_all(app_id, sleep_milliseconds=0, lang="Eng", country="in", sort=Sort.NEWEST)
    if not reviews:
        # Without this guard the column lookups below fail with a KeyError.
        # The message is a fixed string: no user input is echoed back.
        return 'No reviews were found for the requested app.', 404

    df = preprocess_dataframe(pd.json_normalize(reviews))
    # Reviews may lack text; coerce to str so the .split() calls below are safe.
    df['content'] = df['content'].fillna('').astype(str)
    df['sentiment'] = df.apply(lambda row: analyze_sentiment(row['content'], row['score']), axis=1)

    # 1. Word cloud over all review text
    text = ' '.join(df['content'].tolist())
    wordcloud = WordCloud(width=600, height=400, background_color='white').generate(text)
    img_buffer1 = save_wordcloud_to_buffer(wordcloud)

    stop_words = set(stopwords.words('english'))
    # Add more words if necessary
    additional_stopwords = {
        'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours',
        'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself',
        'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which',
        'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be',
        'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an',
        'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by',
        'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before',
        'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over',
        'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why',
        'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no',
        'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will',
        'just', 'don', 'should', 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren',
        'couldn', 'didn', 'doesn', 'hadn', 'hasn', 'haven', 'isn', 'ma', 'mightn', 'mustn',
        'needn', 'shan', 'shouldn', 'wasn', 'weren', 'won', 'wouldn',
    }
    stop_words.update(additional_stopwords)

    # 2. Count plot of the 10 most repeated capitalised, alphabetic,
    # non-stop-word tokens (a rough stand-in for proper nouns).
    proper_nouns = [
        word
        for review in df['content']
        for word in review.split()
        if word.istitle() and word.isalpha() and word.lower() not in stop_words
    ]
    top_proper_nouns = Counter(proper_nouns).most_common(10)
    fig2, ax2 = plt.subplots(figsize=(6, 4))
    if proper_nouns:
        # Skip drawing when nothing qualified; the axes stay empty.
        sns.countplot(y=proper_nouns, order=[word for word, _ in top_proper_nouns], palette='viridis', ax=ax2)
    ax2.set_title('Count Plot of 10 Most Repeated Proper Nouns')
    ax2.set_xlabel('Count')
    buffer2 = save_plot_to_buffer(fig2)

    # 3. Sentiment share among reviews that received no reply.
    # Colours are looked up per label: value_counts() orders labels by
    # frequency, so a positional colour list would not be stable.
    sentiment_colors = {'positive': 'green', 'negative': 'red', 'neutral': 'blue'}
    sentiment_counts = df[df['IsReplied'] == 'No']['sentiment'].value_counts()
    fig3, ax3 = plt.subplots(figsize=(6, 4))
    ax3.pie(sentiment_counts, labels=sentiment_counts.index, autopct='%1.1f%%', startangle=90,
            colors=[sentiment_colors[label] for label in sentiment_counts.index])
    ax3.set_title('Pie Chart of Sentiment Distribution for IsReplied NO')
    buffer3 = save_plot_to_buffer(fig3)

    # Review length in whitespace-separated words; reused by plots 4 and 5.
    df['review_length'] = df['content'].apply(lambda x: len(x.split()))

    # 4. Mean score per (review length, sentiment) as a heatmap
    sentiment_distribution = df.pivot_table(index='review_length', columns='sentiment', values='score', aggfunc='mean')
    fig4, ax4 = plt.subplots(figsize=(6, 4))
    sns.heatmap(sentiment_distribution, cmap='YlGnBu', linewidths=0.5, ax=ax4)
    ax4.set_title('Sentiment Distribution Heatmap')
    ax4.set_xlabel('Sentiment')
    ax4.set_ylabel('Review Length')
    buffer4 = save_plot_to_buffer(fig4)

    # 5. Correlation between review length and rating
    word_freq = pd.DataFrame({'Word Length': df['review_length'], 'Rating': df['score']})
    fig5, ax5 = plt.subplots(figsize=(6, 4))
    sns.heatmap(word_freq.corr(), annot=True, cmap='coolwarm', ax=ax5)
    ax5.set_title('Heatmap of Word Length vs Rating')
    buffer5 = save_plot_to_buffer(fig5)

    # 6. Stacked histogram of score by sentiment
    fig6, ax6 = plt.subplots(figsize=(6, 4))
    sns.histplot(data=df, x='score', hue='sentiment', multiple='stack', palette='husl', ax=ax6)
    ax6.set_title('Joint Count Plot of Score for Positive, Negative, and Neutral')
    ax6.set_xlabel('Score')
    ax6.set_ylabel('Count')
    buffer6 = save_plot_to_buffer(fig6)

    return render_template('app_analysis_final.html', df=df, app_name=app_name,
                           buffer1=img_buffer1, buffer2=buffer2, buffer3=buffer3,
                           buffer4=buffer4, buffer5=buffer5, buffer6=buffer6)
|
| 558 |
+
|
| 559 |
+
|
| 560 |
+
# Function to save plot to buffer
|
| 561 |
+
def save_plot_to_buffer(fig):
    """Render a figure to PNG and return the image as a base64 string.

    Args:
        fig: Figure object exposing ``savefig``; it is closed through
            ``plt.close`` before this function returns, whether or not
            rendering succeeded.

    Returns:
        The PNG bytes, base64-encoded and decoded to ``str``.

    Raises:
        Whatever ``fig.savefig`` raises; the figure is still closed.
    """
    try:
        with io.BytesIO() as png_stream:
            fig.savefig(png_stream, format='png')
            # getvalue() returns the whole buffer regardless of the stream
            # position, so no seek(0) is needed before encoding.
            return base64.b64encode(png_stream.getvalue()).decode()
    finally:
        # Close even when savefig raises; otherwise pyplot keeps the figure
        # alive for the lifetime of the server process.
        plt.close(fig)
|
| 568 |
+
|
| 569 |
+
# Function to save WordCloud image to buffer
|
| 570 |
+
def save_wordcloud_to_buffer(wordcloud):
    """Encode a word cloud's image as a base64 PNG string.

    Args:
        wordcloud: Object exposing ``to_image()``; the returned image must
            support ``save(file, format='PNG')``.

    Returns:
        The PNG bytes, base64-encoded and decoded to ``str``.
    """
    picture = wordcloud.to_image()
    with io.BytesIO() as png_stream:
        picture.save(png_stream, format='PNG')
        encoded = base64.b64encode(png_stream.getvalue())
    return encoded.decode()
|
| 578 |
+
|
| 579 |
+
|
| 580 |
+
def _insurance_figure_png(draw, figsize=(6, 4)):
    """Run ``draw(fig, ax)`` on a fresh figure and return it as base64 PNG.

    The figure is closed here if drawing fails, so a plotting error does not
    leave an orphaned figure registered with pyplot. On success the figure is
    handed to ``save_plot_to_buffer``, which encodes and closes it.
    """
    fig, ax = plt.subplots(figsize=figsize)
    try:
        draw(fig, ax)
    except Exception:
        plt.close(fig)
        raise
    return save_plot_to_buffer(fig)


def _insurance_histogram(data, column, color, title, xlabel):
    """Return a histogram (with KDE overlay) of ``data[column]`` as base64 PNG."""
    def draw(_fig, ax):
        sns.histplot(data[column], kde=True, color=color, ax=ax)
        ax.set_title(title)
        ax.set_xlabel(xlabel)
        ax.set_ylabel('Frequency')

    return _insurance_figure_png(draw)


def _insurance_fraud_countplot(data, column, palette, title, xlabel):
    """Return a count plot of ``column`` split by ``IsFraud`` as base64 PNG."""
    def draw(_fig, ax):
        sns.countplot(x=column, hue='IsFraud', data=data, palette=palette, ax=ax)
        ax.set_title(title)
        ax.set_xlabel(xlabel)
        ax.set_ylabel('Count')

    return _insurance_figure_png(draw)


@app.route('/analysis/insurance')
def analysis_insurance():
    """Render the insurance analysis page with twelve embedded plots.

    Each plot is drawn from ``df`` (not assigned in this function, so it is
    read from the enclosing module scope), converted to a base64 PNG string
    and passed to the ``insurance_analysis.html`` template as ``plot1`` ...
    ``plot12``.

    Returns:
        The result of ``render_template`` for ``insurance_analysis.html``.
    """
    data = df
    plots = {}

    # Visualization 1: Distribution of Car Prices
    plots['plot1'] = _insurance_histogram(
        data, 'CarPrice', 'skyblue', 'Distribution of Car Prices', 'Car Price')

    # Visualization 2: Distribution of Owner Ages
    plots['plot2'] = _insurance_histogram(
        data, 'OwnerAge', 'salmon', 'Distribution of Owner Ages', 'Owner Age')

    # Visualization 3: Count of Claims by Car Category
    plots['plot3'] = _insurance_fraud_countplot(
        data, 'CarCategory', 'coolwarm', 'Count of Claims by Car category', 'Car category')

    # Visualization 4: Distribution of Car Prices by Fraud Status
    def draw_price_by_fraud(_fig, ax):
        sns.boxplot(x='IsFraud', y='CarPrice', data=data, palette='Set2', ax=ax)
        ax.set_title('Distribution of Car Prices by Fraud Status')
        ax.set_xlabel('Fraud Status')
        ax.set_ylabel('Car Price')

    plots['plot4'] = _insurance_figure_png(draw_price_by_fraud)

    # Visualization 5: Count of Claims by Accident Area
    plots['plot5'] = _insurance_fraud_countplot(
        data, 'AccidentArea', 'husl', 'Count of Claims by Accident Area', 'Accident Area')

    # Visualization 6: Distribution of Number of Supplements
    # (the column name is spelled 'NumberOfSuppliments' in the dataset lookup)
    plots['plot6'] = _insurance_histogram(
        data, 'NumberOfSuppliments', 'orange',
        'Distribution of Number of Supplements', 'Number of Supplements')

    # Visualization 7: Count of Claims by Witness Presence
    plots['plot7'] = _insurance_fraud_countplot(
        data, 'WitnessPresent', 'viridis', 'Count of Claims by Witness Presence', 'Witness Presence')

    # Visualization 8: Distribution of Past Number of Claims
    plots['plot8'] = _insurance_histogram(
        data, 'PastNumberOfClaims', 'purple',
        'Distribution of Past Number of Claims', 'Past Number of Claims')

    # Visualization 9: Correlation heatmap over the numeric columns only.
    # The matrix is computed before the figure exists, as in the original flow.
    corr = data.select_dtypes(include='number').corr()

    def draw_correlation(_fig, ax):
        sns.heatmap(corr, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
        ax.set_title('Heatmap of Correlation Matrix')

    plots['plot9'] = _insurance_figure_png(draw_correlation, figsize=(6.5, 4.5))

    # Visualization 10: Network Graph of Car Brands and Fraud Status
    def draw_brand_network(_fig, ax):
        graph = nx.from_pandas_edgelist(data, 'CarCompany', 'IsFraud')
        nx.draw(graph, with_labels=True, node_color='skyblue', node_size=2000,
                font_size=10, ax=ax)
        ax.set_title('Network Graph of Car Brands and Fraud Status')

    plots['plot10'] = _insurance_figure_png(draw_brand_network)

    # Visualization 11: Violin Plot of Accident Area and Car Price
    def draw_area_violin(_fig, ax):
        sns.violinplot(x='AccidentArea', y='CarPrice', data=data, hue='IsFraud',
                       split=True, palette='husl', ax=ax)
        ax.set_title('Violin Plot of Accident Area and Car Price')

    plots['plot11'] = _insurance_figure_png(draw_area_violin)

    # Visualization 12: Hexbin Plot of Car Prices and Owner Ages
    def draw_price_age_hexbin(fig, ax):
        hexes = ax.hexbin(data['CarPrice'], data['OwnerAge'], gridsize=50, cmap='inferno')
        ax.set_title('Hexbin Plot of Car Prices and Owner Ages')
        ax.set_xlabel('Car Price')
        ax.set_ylabel('Owner Age')
        colorbar = fig.colorbar(hexes, ax=ax)
        colorbar.set_label('Frequency')

    plots['plot12'] = _insurance_figure_png(draw_price_age_hexbin)

    # Hand every encoded plot to the template under its plotN keyword.
    return render_template('insurance_analysis.html', **plots)
|
| 685 |
+
|
| 686 |
+
|
| 687 |
+
if __name__ == "__main__":
    import os

    # The interactive debugger enabled by debug=True lets anyone who can reach
    # the server execute code, so it is opt-in through FLASK_DEBUG instead of
    # being hard-coded on.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in {"1", "true", "yes", "on"}
    app.run(debug=debug_enabled)
|
| 689 |
+
|
| 690 |
+
|
requirements.txt
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Flask
|
| 2 |
+
matplotlib
|
| 3 |
+
gunicorn
|
| 4 |
+
pandas
|
| 5 |
+
joblib
|
| 6 |
+
seaborn
|
| 7 |
+
wordcloud
|
| 8 |
+
nltk
|
| 9 |
+
google-play-scraper
|
| 10 |
+
networkx
|
| 11 |
+
gunicorn
|
| 12 |
+
plotly
|
| 13 |
+
scikit-learn==1.2.2
|
| 14 |
+
numpy==1.25.2
|
| 15 |
+
beautifulsoup4
|
| 16 |
+
requests
|