Spaces:
Runtime error
Runtime error
Commit
·
56f6887
0
Parent(s):
first commit
Browse files- .gitattributes +2 -0
- .github/workflows/main.yaml +20 -0
- .gitignore +12 -0
- .streamlit/config.toml +2 -0
- README.md +26 -0
- Screenshot from 2023-10-23 09-13-41.png +0 -0
- __init__.py +0 -0
- cricksheet.py +87 -0
- eda.ipynb +153 -0
- features.py +146 -0
- model.py +174 -0
- model/odifeatures.feather.joblib +3 -0
- model/t20features.feather.joblib +3 -0
- model/team.npy +0 -0
- packages.txt +1 -0
- requirements.txt +14 -0
- scrape.py +231 -0
- serve.py +472 -0
- server.sh +2 -0
- trainandserve.sh +3 -0
.gitattributes
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
model/t20features.feather.joblib filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
model/odifeatures.feather.joblib filter=lfs diff=lfs merge=lfs -text
|
.github/workflows/main.yaml
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: Sync to Hugging Face hub
|
| 2 |
+
on:
|
| 3 |
+
push:
|
| 4 |
+
branches: [main]
|
| 5 |
+
|
| 6 |
+
# to run this workflow manually from the Actions tab
|
| 7 |
+
workflow_dispatch:
|
| 8 |
+
|
| 9 |
+
jobs:
|
| 10 |
+
sync-to-hub:
|
| 11 |
+
runs-on: ubuntu-latest
|
| 12 |
+
steps:
|
| 13 |
+
- uses: actions/checkout@v3
|
| 14 |
+
with:
|
| 15 |
+
fetch-depth: 0
|
| 16 |
+
lfs: true
|
| 17 |
+
- name: Push to hub
|
| 18 |
+
env:
|
| 19 |
+
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
| 20 |
+
run: git push https://Instantaneous1:$HF_TOKEN@huggingface.co/spaces/Instantaneous1/cricket-prophet main
|
.gitignore
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
temp/
|
| 2 |
+
data/
|
| 3 |
+
cricsheet/
|
| 4 |
+
catbosst_info/
|
| 5 |
+
depr/
|
| 6 |
+
env/
|
| 7 |
+
__pycache__/
|
| 8 |
+
result/
|
| 9 |
+
static_test/
|
| 10 |
+
temp/
|
| 11 |
+
history/
|
| 12 |
+
.ipynb_checkpoints/
|
.streamlit/config.toml
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[theme]
|
| 2 |
+
base="dark"
|
README.md
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Cricket-Prophet
|
| 2 |
+
|
| 3 |
+
#cricketprophet is an AI/ML-based cricket score prediction app. It takes into account the batting team, current runs, and fall of wickets, and gives a realistic prediction of the final score using a #randomforest
|
| 4 |
+
|
| 5 |
+
Scores are fetched from #cricbuzz site in realtime
|
| 6 |
+
|
| 7 |
+
The app is online at https://cricket-prophet.streamlit.app/
|
| 8 |
+
|
| 9 |
+
It is a better prediction than the projected score as it doesn't only rely on current run rate, but also balls left, wkts left and batting team.
|
| 10 |
+
|
| 11 |
+
#machinelearning #cricket #sportsprediction
|
| 12 |
+
|
| 13 |
+
## 
|
| 14 |
+
|
| 15 |
+
title: Cricket Prophet
|
| 16 |
+
emoji: 📈
|
| 17 |
+
colorFrom: yellow
|
| 18 |
+
colorTo: purple
|
| 19 |
+
sdk: streamlit
|
| 20 |
+
sdk_version: 1.29.0
|
| 21 |
+
app_file: app.py
|
| 22 |
+
pinned: false
|
| 23 |
+
|
| 24 |
+
---
|
| 25 |
+
|
| 26 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
Screenshot from 2023-10-23 09-13-41.png
ADDED
|
__init__.py
ADDED
|
File without changes
|
cricksheet.py
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json, os
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from tqdm import tqdm
|
| 4 |
+
from datetime import datetime
|
| 5 |
+
|
| 6 |
+
root = "cricsheet/all_json"
|
| 7 |
+
|
| 8 |
+
# print([json.load(open(os.path.join(root, f)))['meta']['data_version'] for f in os.listdir(root) if json.load(open(os.path.join(root, f)))['meta']['data_version']=='1.1.0'])
|
| 9 |
+
# print(set([json.load(open(os.path.join(root, f)))['info']['match_type'] for f in os.listdir(root) if f.endswith('.json') and json.load(open(os.path.join(root, f)))['meta']['data_version']=='1.1.0']))
|
| 10 |
+
|
| 11 |
+
# formats: 'ODI', 'MDM', 'IT20', 'ODM', 'Test', 'T20'
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class Inning(object):
    """One innings of a match: a ball-by-ball DataFrame plus metadata."""

    def __init__(self, df, inning, format):
        # Ball-by-ball frame with "run" and "wicket" columns.
        self.df = df
        # 1-based innings number (1 = batting first, 2 = chasing).
        self.inning = inning
        # Match format label, e.g. "T20" or "ODI".
        self.format = format
        # Total runs scored across the whole innings.
        self.final_score = df["run"].sum()

    def settarget(self, target):
        """Record the chase target (normally the first innings' score).

        Warns when called on a first innings, but still stores the value
        (matching the historical behaviour).
        """
        if self.inning == 1:
            print("first innning: don't set target")
        self.target = target
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def process_inning(ballbyball):
    """Flatten an innings' overs into a per-ball (run, wicket) DataFrame.

    Deliveries beyond the sixth of an over (extras re-bowled, etc.) are
    folded into the sixth entry, so every over contributes at most six
    rows. The returned frame is indexed 1..n with index name "balls".
    """
    balls = []
    for over in ballbyball["overs"]:
        per_over = []
        for position, delivery in enumerate(over["deliveries"]):
            runs_scored = delivery["runs"]["total"]
            wickets_fallen = len(delivery.get("wickets", []))
            if position >= 6:
                # Merge any extra delivery into the previous (6th) ball.
                prev_runs, prev_wkts = per_over.pop()
                runs_scored += prev_runs
                wickets_fallen += prev_wkts
            per_over.append((runs_scored, wickets_fallen))
        balls.extend(per_over)
    frame = pd.DataFrame(
        balls, columns=["run", "wicket"], index=range(1, len(balls) + 1)
    )
    frame.index.name = "balls"
    return frame
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def process_matches(matches, format):
    """Yield two Inning objects per completed (two-innings) match.

    Matches that do not have exactly two innings (abandoned, forfeited)
    are skipped. Each pair shares a sequential matchid; the chasing
    innings gets the first innings' final score as its target, and the
    batting/bowling team attributes are cross-assigned from the raw data.
    """
    print("processing jsons...")
    match_counter = 0
    for match in tqdm(matches):
        if len(match) != 2:
            continue
        first, second = (
            Inning(process_inning(raw), position + 1, format)
            for position, raw in enumerate(match)
        )
        second.settarget(first.final_score)
        # Team of innings 1 bats first and bowls second, and vice versa.
        first.battingteam = second.bowlingteam = match[0]["team"]
        second.battingteam = first.bowlingteam = match[1]["team"]
        match_counter += 1
        first.matchid = second.matchid = match_counter
        yield first
        yield second
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def get_all_matches(
    format,
    since=1990,
):
    """Load every cricsheet JSON under `root` matching `format` and year.

    Parameters:
        format: substring matched against the JSON's info.match_type
                (note: "T20" therefore also matches "IT20" — presumably
                intentional; TODO confirm).
        since:  keep only matches whose first listed date falls in this
                year or later.

    Returns:
        list of Inning objects produced by process_matches.
    """
    matches = []
    print("Loading jsons...")
    for fname in tqdm(os.listdir(root)):
        if not fname.endswith(".json"):
            continue
        # BUGFIX: use a context manager so each file handle is closed
        # promptly; the original `json.load(open(...))` leaked one open
        # handle per JSON file.
        with open(os.path.join(root, fname)) as fh:
            obj = json.load(fh)
        if (
            format in obj["info"]["match_type"]
            and datetime.strptime(obj["info"]["dates"][0], "%Y-%m-%d").year
            >= since
        ):
            matches.append(obj["innings"])
    return list(process_matches(matches, format))
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
# get_all_T20s()
|
eda.ipynb
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 1,
|
| 6 |
+
"id": "62252039-37f7-467f-bde2-0a576770d4be",
|
| 7 |
+
"metadata": {},
|
| 8 |
+
"outputs": [],
|
| 9 |
+
"source": [
|
| 10 |
+
"import pandas as pd\n"
|
| 11 |
+
]
|
| 12 |
+
},
|
| 13 |
+
{
|
| 14 |
+
"cell_type": "code",
|
| 15 |
+
"execution_count": 2,
|
| 16 |
+
"id": "e9f9088b-aab7-4faf-a158-5b6e51d1b1bc",
|
| 17 |
+
"metadata": {},
|
| 18 |
+
"outputs": [],
|
| 19 |
+
"source": [
|
| 20 |
+
"features = [\n",
|
| 21 |
+
" # \"batting_team\",\n",
|
| 22 |
+
" # \"bowling_team\",\n",
|
| 23 |
+
" # \"balls\",\n",
|
| 24 |
+
" # \"runs\",\n",
|
| 25 |
+
" # \"wickets\",\n",
|
| 26 |
+
" \"wkt_last_5_overs\",\n",
|
| 27 |
+
" # \"runrate_last_5_overs\",\n",
|
| 28 |
+
" \"current_RR\",\n",
|
| 29 |
+
" # \"average\",\n",
|
| 30 |
+
" \"balls_left\",\n",
|
| 31 |
+
" \"wkts_left\",\n",
|
| 32 |
+
" # \"required_RR\",\n",
|
| 33 |
+
" # \"projected_score_more\",\n",
|
| 34 |
+
" # \"min_score_more\",\n",
|
| 35 |
+
" # \"max_score_more\",\n",
|
| 36 |
+
" # \"projected_avg_score_more\",\n",
|
| 37 |
+
" \"runrate_last_5_overs-current_RR\",\n",
|
| 38 |
+
" \"deviation_from_projected\",\n",
|
| 39 |
+
"]\n"
|
| 40 |
+
]
|
| 41 |
+
},
|
| 42 |
+
{
|
| 43 |
+
"cell_type": "code",
|
| 44 |
+
"execution_count": 3,
|
| 45 |
+
"id": "7694e380-b6d3-4d56-af94-66f5233e6c49",
|
| 46 |
+
"metadata": {},
|
| 47 |
+
"outputs": [
|
| 48 |
+
{
|
| 49 |
+
"name": "stderr",
|
| 50 |
+
"output_type": "stream",
|
| 51 |
+
"text": [
|
| 52 |
+
"/media/instantinopaul/data/Code/ML/github.com/scorepredictor/env/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
| 53 |
+
" from .autonotebook import tqdm as notebook_tqdm\n"
|
| 54 |
+
]
|
| 55 |
+
}
|
| 56 |
+
],
|
| 57 |
+
"source": [
|
| 58 |
+
"from ydata_profiling import ProfileReport"
|
| 59 |
+
]
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"cell_type": "code",
|
| 63 |
+
"execution_count": 4,
|
| 64 |
+
"id": "4845035c-3501-4eb0-a67b-0bbc5e289a8b",
|
| 65 |
+
"metadata": {},
|
| 66 |
+
"outputs": [
|
| 67 |
+
{
|
| 68 |
+
"name": "stderr",
|
| 69 |
+
"output_type": "stream",
|
| 70 |
+
"text": [
|
| 71 |
+
"/media/instantinopaul/data/Code/ML/github.com/scorepredictor/env/lib/python3.10/site-packages/ydata_profiling/utils/dataframe.py:137: SettingWithCopyWarning: \n",
|
| 72 |
+
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
| 73 |
+
"\n",
|
| 74 |
+
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
| 75 |
+
" df.rename(columns={\"index\": \"df_index\"}, inplace=True)\n",
|
| 76 |
+
"Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]/media/instantinopaul/data/Code/ML/github.com/scorepredictor/env/lib/python3.10/site-packages/ydata_profiling/model/typeset.py:125: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\n",
|
| 77 |
+
" not pdt.is_categorical_dtype(series)\n",
|
| 78 |
+
"Summarize dataset: 100%|██████████| 51/51 [00:11<00:00, 4.50it/s, Completed] \n",
|
| 79 |
+
"Generate report structure: 100%|██████████| 1/1 [00:02<00:00, 2.68s/it]\n",
|
| 80 |
+
"Render HTML: 100%|██████████| 1/1 [00:01<00:00, 1.43s/it]\n",
|
| 81 |
+
"Export report to file: 100%|██████████| 1/1 [00:00<00:00, 210.06it/s]\n"
|
| 82 |
+
]
|
| 83 |
+
}
|
| 84 |
+
],
|
| 85 |
+
"source": [
|
| 86 |
+
"df=pd.read_feather('data/t20features.feather')\n",
|
| 87 |
+
"r=ProfileReport(df[features])\n",
|
| 88 |
+
"r.to_file('result/profilereportT20.html')"
|
| 89 |
+
]
|
| 90 |
+
},
|
| 91 |
+
{
|
| 92 |
+
"cell_type": "code",
|
| 93 |
+
"execution_count": 5,
|
| 94 |
+
"id": "1272e5c3-2a60-4966-b70a-43f49101a5a9",
|
| 95 |
+
"metadata": {
|
| 96 |
+
"scrolled": true
|
| 97 |
+
},
|
| 98 |
+
"outputs": [
|
| 99 |
+
{
|
| 100 |
+
"name": "stderr",
|
| 101 |
+
"output_type": "stream",
|
| 102 |
+
"text": [
|
| 103 |
+
"/media/instantinopaul/data/Code/ML/github.com/scorepredictor/env/lib/python3.10/site-packages/ydata_profiling/utils/dataframe.py:137: SettingWithCopyWarning: \n",
|
| 104 |
+
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
| 105 |
+
"\n",
|
| 106 |
+
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
| 107 |
+
" df.rename(columns={\"index\": \"df_index\"}, inplace=True)\n",
|
| 108 |
+
"Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]/media/instantinopaul/data/Code/ML/github.com/scorepredictor/env/lib/python3.10/site-packages/ydata_profiling/model/typeset.py:208: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\n",
|
| 109 |
+
" is_valid_dtype = pdt.is_categorical_dtype(series) and not pdt.is_bool_dtype(\n",
|
| 110 |
+
"Summarize dataset: 100%|██████████| 51/51 [00:07<00:00, 7.05it/s, Completed] \n",
|
| 111 |
+
"Generate report structure: 100%|██████████| 1/1 [00:02<00:00, 2.76s/it]\n",
|
| 112 |
+
"Render HTML: 100%|██████████| 1/1 [00:01<00:00, 1.32s/it]\n",
|
| 113 |
+
"Export report to file: 100%|██████████| 1/1 [00:00<00:00, 121.46it/s]\n"
|
| 114 |
+
]
|
| 115 |
+
}
|
| 116 |
+
],
|
| 117 |
+
"source": [
|
| 118 |
+
"df=pd.read_feather('data/odifeatures.feather')\n",
|
| 119 |
+
"r=ProfileReport(df[features])\n",
|
| 120 |
+
"r.to_file('result/profilereportODI.html')"
|
| 121 |
+
]
|
| 122 |
+
},
|
| 123 |
+
{
|
| 124 |
+
"cell_type": "code",
|
| 125 |
+
"execution_count": null,
|
| 126 |
+
"id": "e55fd847-05c8-47dc-aedc-9b96c23b4aa6",
|
| 127 |
+
"metadata": {},
|
| 128 |
+
"outputs": [],
|
| 129 |
+
"source": []
|
| 130 |
+
}
|
| 131 |
+
],
|
| 132 |
+
"metadata": {
|
| 133 |
+
"kernelspec": {
|
| 134 |
+
"display_name": "cricpred",
|
| 135 |
+
"language": "python",
|
| 136 |
+
"name": "cricpred"
|
| 137 |
+
},
|
| 138 |
+
"language_info": {
|
| 139 |
+
"codemirror_mode": {
|
| 140 |
+
"name": "ipython",
|
| 141 |
+
"version": 3
|
| 142 |
+
},
|
| 143 |
+
"file_extension": ".py",
|
| 144 |
+
"mimetype": "text/x-python",
|
| 145 |
+
"name": "python",
|
| 146 |
+
"nbconvert_exporter": "python",
|
| 147 |
+
"pygments_lexer": "ipython3",
|
| 148 |
+
"version": "3.10.12"
|
| 149 |
+
}
|
| 150 |
+
},
|
| 151 |
+
"nbformat": 4,
|
| 152 |
+
"nbformat_minor": 5
|
| 153 |
+
}
|
features.py
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import subprocess, sys
|
| 2 |
+
from multiprocessing import Pool
|
| 3 |
+
import pandas as pd, json, os, math
|
| 4 |
+
import numpy as np
|
| 5 |
+
from tqdm import tqdm
|
| 6 |
+
from sklearn.model_selection import train_test_split
|
| 7 |
+
import matplotlib.pyplot as plt
|
| 8 |
+
from cricksheet import get_all_matches
|
| 9 |
+
|
| 10 |
+
# import ydata_profiling
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
## Reading IPL dataset
|
| 14 |
+
total_wickets = 10
|
| 15 |
+
n_pools = 100
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
## Feature selection/creation and ngram creation
|
| 19 |
+
|
| 20 |
+
features = [
|
| 21 |
+
"matchid",
|
| 22 |
+
"format",
|
| 23 |
+
"inning",
|
| 24 |
+
"batting_team",
|
| 25 |
+
"bowling_team",
|
| 26 |
+
"balls",
|
| 27 |
+
"runs",
|
| 28 |
+
"wickets",
|
| 29 |
+
"wkt_last_5_overs",
|
| 30 |
+
"runrate_last_5_overs",
|
| 31 |
+
"runrate_last_5_overs-current_RR",
|
| 32 |
+
"current_RR",
|
| 33 |
+
# "average",
|
| 34 |
+
"balls_left",
|
| 35 |
+
"wkts_left",
|
| 36 |
+
# "required_RR",
|
| 37 |
+
# "projected_score_more",
|
| 38 |
+
# "min_score_more",
|
| 39 |
+
# "max_score_more",
|
| 40 |
+
# "projected_avg_score_more",
|
| 41 |
+
"final_score",
|
| 42 |
+
"final_score_more",
|
| 43 |
+
"deviation_from_projected",
|
| 44 |
+
]
|
| 45 |
+
|
| 46 |
+
getformat = {"ODI": 1, "T20": 2}
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def extract_features(inning):
    """Turn one Inning into a list of per-ball feature tuples.

    For each ball i (from the second delivery onward) the tuple describes
    the state of the innings after the first i balls, in the order of the
    module-level `features` list.
    """
    data = []
    # Total deliveries actually bowled in this innings (not the nominal
    # 120/300 for the format — shortened innings keep their real length).
    total_balls = len(inning.df)
    df = inning.df

    for i in range(1, len(df)):
        # Assumed per-ball run-rate bounds for the (currently unused)
        # min/max projections.
        min_RR = 0.5
        max_RR = 2.5
        played = df.iloc[:i]
        runs = played["run"].sum()
        # BUGFIX: the original summed the last 30 balls of the WHOLE
        # innings (df["run"].iloc[-30:]) instead of the last 30 balls
        # bowled so far — unlike the wicket feature below, which was
        # already windowed on the balls played.
        run_last_5_overs = played["run"].iloc[-30:].sum()
        # NOTE(review): 30 balls are 5 overs, so a per-over rate would be
        # run_last_5_overs / 5; the historical divisor 6 is kept because
        # the shipped models were trained with it. TODO confirm intent.
        runrate_last_5_overs = run_last_5_overs / 6

        wickets = played["wicket"].sum()
        wkt_last_5_overs = played["wicket"].iloc[-30:].sum()

        balls = len(played)

        current_RR = (runs * 6) / balls
        rr_diff = runrate_last_5_overs - current_RR
        # +1 avoids division by zero before the first wicket falls.
        average = runs / (wickets + 1)

        balls_left = total_balls - balls
        wk_left = total_wickets - wickets

        # BUGFIX: required run rate is over the balls REMAINING, not the
        # balls already bowled. This feature is currently commented out of
        # the output tuple, so saved features are unchanged.
        required_RR = (
            ((inning.target - runs) * 6) / balls_left
            if inning.inning == 2 and balls_left
            else -9999
        )

        projected_score_more = current_RR * balls_left / 6
        min_score_more = min_RR * balls_left / 6
        max_score_more = max_RR * balls_left / 6
        projected_avg_score_more = average * wk_left / 6

        final_score_more = inning.final_score - runs
        format = getformat[inning.format]

        deviation_from_projected = final_score_more - projected_score_more
        data.append(
            (
                inning.matchid,
                format,
                inning.inning,
                inning.battingteam,
                inning.bowlingteam,
                balls,
                runs,
                wickets,
                wkt_last_5_overs,
                round(runrate_last_5_overs, 2),
                round(rr_diff, 2),
                round(current_RR, 2),
                # average,
                balls_left,
                wk_left,
                # required_RR,
                # projected_score_more,
                # min_score_more,
                # max_score_more,
                # projected_avg_score_more,
                inning.final_score,
                final_score_more,
                round(deviation_from_projected),
            )
        )
    return data
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
def save_features(innings, fname):
    """Extract features from every innings in parallel and save to disk.

    Writes the feature table to `fname` (feather) and a CSV alongside it.
    """
    print("Feature engineering and ngram creation...")

    n_innings = len(innings)
    print(f"{n_innings=}")
    # BUGFIX: use the pool as a context manager so worker processes are
    # terminated even if a worker raises (the original leaked the pool).
    with Pool(processes=n_pools) as pool:
        Xy = pool.map(extract_features, innings)

    # Flatten the per-innings lists into one flat list of feature tuples.
    Xy = [xi for Xi in Xy for xi in Xi]
    print(f"{len(Xy)=}")
    featuresdf = pd.DataFrame(Xy, columns=features)
    featuresdf.to_feather(fname)
    featuresdf.to_csv(fname + ".csv")
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
if __name__ == "__main__":
    # Build and persist the feature tables for both formats.
    # since=2021 restricts training data to recent matches.
    print("Loading t20 data...")
    innings = get_all_matches(format="T20", since=2021)
    print("Saving t20 data")
    save_features(innings, "data/t20features.feather")

    print("Loading odi data...")
    innings = get_all_matches(format="ODI", since=2021)
    print("Saving odi data")
    save_features(innings, "data/odifeatures.feather")
|
model.py
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
from sklearn.model_selection import train_test_split
|
| 4 |
+
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
|
| 5 |
+
import math
|
| 6 |
+
import matplotlib.pyplot as plt, joblib
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
# from sklearn.linear_model import LinearRegression
|
| 10 |
+
from sklearn.ensemble import RandomForestRegressor
|
| 11 |
+
|
| 12 |
+
# from sklearn.tree import DecisionTreeRegressor
|
| 13 |
+
|
| 14 |
+
# from catboost import CatBoostRegressor
|
| 15 |
+
import warnings, random
|
| 16 |
+
from sklearn.metrics import mean_absolute_error as mae
|
| 17 |
+
from sklearn.metrics import mean_squared_error as mse
|
| 18 |
+
|
| 19 |
+
# from sklearn import tree
|
| 20 |
+
# from sklearn.svm import SVR
|
| 21 |
+
# from sklearn.ensemble import VotingRegressor
|
| 22 |
+
import os
|
| 23 |
+
|
| 24 |
+
warnings.filterwarnings("ignore")
|
| 25 |
+
features = [
|
| 26 |
+
"batting_team",
|
| 27 |
+
# "bowling_team",
|
| 28 |
+
# "balls",
|
| 29 |
+
# "runs",
|
| 30 |
+
# "wickets",
|
| 31 |
+
"wkt_last_5_overs",
|
| 32 |
+
# "runrate_last_5_overs",
|
| 33 |
+
"current_RR",
|
| 34 |
+
# "average",
|
| 35 |
+
"balls_left",
|
| 36 |
+
"wkts_left",
|
| 37 |
+
# "required_RR",
|
| 38 |
+
# "projected_score_more",
|
| 39 |
+
# "min_score_more",
|
| 40 |
+
# "max_score_more",
|
| 41 |
+
# "projected_avg_score_more",
|
| 42 |
+
"runrate_last_5_overs-current_RR",
|
| 43 |
+
]
|
| 44 |
+
target = "deviation_from_projected"
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
# evaluate
|
| 48 |
+
def evaluate(model, featuresdf, x_test, fname):
    """Plot mean absolute prediction error vs. balls bowled on the holdout.

    Saves the plot to result/<fname>.png and a small random sample of the
    prediction frame to result/<fname>_sample.csv.
    """
    predictdf = featuresdf.loc[x_test.index].copy()
    predictdf["h_deviation_from_projected"] = model.predict(
        featuresdf.loc[x_test.index][features]
    )
    predictdf["error"] = (
        predictdf["h_deviation_from_projected"] - predictdf["deviation_from_projected"]
    )
    predictdf["abs_error"] = predictdf["error"].abs()
    plt.plot(predictdf.groupby("balls").aggregate({"abs_error": "mean"}))
    # BUGFIX: plt.legend("Abs deviation") iterates the string character by
    # character (one legend entry per letter); pass a list of labels.
    plt.legend(["Abs deviation"])

    plt.title(type(model).__name__)
    plt.xlabel("Balls on which prediction was made")
    plt.ylabel("Mean Abs Prediction error")
    plt.savefig("result/" + fname + ".png")
    plt.clf()
    predictdf.sample(frac=0.0001).to_csv("result/" + fname + "_sample.csv")
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def train_test_split_matchid(df, matchids, split=0.2):
    """Split features/target by match so one match never straddles sets.

    Uses the module-level `features` and `target` column lists.

    Returns:
        (x_train, x_test, y_train, y_test)
    """
    unique_match_ids = set(matchids)
    print(f"{len(unique_match_ids)=}")
    # BUGFIX: random.sample() on a set is deprecated since Python 3.9 and
    # raises TypeError on 3.11+; draw from a sorted list instead (sorting
    # also makes the draw reproducible under a fixed seed).
    testids = random.sample(sorted(unique_match_ids), int(len(unique_match_ids) * split))
    trainids = list(unique_match_ids.difference(testids))
    in_train = df.matchid.isin(trainids)
    in_test = df.matchid.isin(testids)
    return (
        df[features][in_train],
        df[features][in_test],
        df[target][in_train],
        df[target][in_test],
    )
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def encode_teams(series):
    """Fit a LabelEncoder on all team names and persist its classes.

    The classes array is written to model/team.npy so later runs (and the
    serving code) can rebuild the identical encoding.
    """
    label_encoder = LabelEncoder()
    label_encoder.fit(series)
    np.save("model/team.npy", label_encoder.classes_)
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def transform_teams(series):
    """Map team names to the integer codes persisted by encode_teams."""
    encoder = LabelEncoder()
    encoder.classes_ = np.load("model/team.npy", allow_pickle=True)
    # BUGFIX: LabelEncoder.transform expects a 1-D array; the original
    # passed an (n, 1) column vector, which sklearn only tolerates with a
    # DataConversionWarning. Flatten once instead of reshape/reshape.
    return encoder.transform(np.asarray(series).ravel())
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
def plot_feature_importance(f, imp, fname):
    """Bar-plot normalized feature importances.

    Saves the figure to result/<fname>featureimp.png.

    Parameters:
        f:     feature names (same order as `imp`).
        imp:   raw importance values.
        fname: output file stem.
    """
    importance = (
        pd.DataFrame(
            zip(*[f, imp]),
            columns=["feature", "importance"],
        )
        .sort_values("importance", ascending=False)
        .set_index("feature")
    )
    # Normalize so the bars sum to 1.
    importance["importance"] = importance["importance"] / importance["importance"].sum()
    fig, ax = plt.subplots()
    importance.plot.bar(ax=ax)
    # BUGFIX: bars are sorted by importance, but the original labelled them
    # with `f` (the UNSORTED feature order), mislabelling every bar; use
    # the sorted index instead.
    ax.bar_label(ax.containers[0], labels=importance.index, rotation=90, label_type="center")
    ax.set_xticks([])
    ax.set_title("Feature importances for predicted score " + fname)
    ax.set_ylabel("Significance")
    ax.set_xlabel("Features")
    plt.savefig("result/" + fname + "featureimp.png")
    plt.clf()
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
def train(fname, max_depth=-1):
    """Train a RandomForest on second-innings features loaded from `fname`.

    Evaluates on a match-level holdout, then refits on all data and dumps
    the model to model/<basename>.joblib.

    Parameters:
        fname:     feather file produced by features.py.
        max_depth: forest depth; -1 (the default) keeps the historical
                   depth of 8.

    Returns:
        the fitted RandomForestRegressor.
    """
    print("training on", fname, "...")
    featuresdf = pd.read_feather(fname)
    # Only train on chases (second innings).
    featuresdf = featuresdf[featuresdf["inning"] == 2]
    encode_teams(
        featuresdf["batting_team"].to_list() + featuresdf["bowling_team"].to_list()
    )
    featuresdf["batting_team"] = transform_teams(featuresdf["batting_team"])
    featuresdf["bowling_team"] = transform_teams(featuresdf["bowling_team"])
    x_train, x_test, y_train, y_test = train_test_split_matchid(
        featuresdf, featuresdf["matchid"], 0.2
    )
    print(f"{len(x_train)=} {len(x_test)=}")

    # BUGFIX: the max_depth parameter was accepted but silently ignored
    # (depth was hard-coded to 8); honor it while keeping the old default.
    model = RandomForestRegressor(max_depth=8 if max_depth == -1 else max_depth)
    model.fit(x_train, y_train)

    # Random-forest feature importances: spread (std) across trees.
    plot_feature_importance(
        features,
        np.std([tree.feature_importances_ for tree in model.estimators_], axis=0),
        os.path.basename(fname),
    )
    print("Depth:", [e.tree_.max_depth for e in model.estimators_])

    # RMSE on train and holdout (squared=False).
    print(
        f"{mse(model.predict(x_train), y_train, squared=False)=}, {mse(model.predict(x_test), y_test, squared=False)=}"
    )
    evaluate(model, featuresdf, x_test, os.path.basename(fname))
    # Refit on ALL data before shipping; the holdout was only for the
    # metrics printed above.
    model.fit(featuresdf[features], featuresdf[target])

    joblib.dump(model, f"model/{os.path.basename(fname)}.joblib")
    return model
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
if __name__ == "__main__":
    # Train and persist one model per format.
    train("data/t20features.feather")
    train("data/odifeatures.feather")
|
model/odifeatures.feather.joblib
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:901b2b44a7095ce014a98f2dc5989d06691bb1c39ff9ea6e0a3496c3eb44331d
|
| 3 |
+
size 3497985
|
model/t20features.feather.joblib
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:10fc5bd06a3cf347b0eede72942802189138cea8e607a889330f59565fc87db8
|
| 3 |
+
size 3706353
|
model/team.npy
ADDED
|
Binary file (2.34 kB). View file
|
|
|
packages.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
chromium
|
requirements.txt
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
scikit-learn
|
| 2 |
+
pandas
|
| 3 |
+
numpy
|
| 4 |
+
streamlit-echarts
|
| 5 |
+
streamlit
|
| 6 |
+
tqdm
|
| 7 |
+
matplotlib
|
| 8 |
+
beautifulsoup4
|
| 9 |
+
selenium
|
| 10 |
+
webdriver-manager
|
| 11 |
+
chromedriver_autoinstaller
|
| 12 |
+
seleniumbase
|
| 13 |
+
streamlit-analytics
|
| 14 |
+
ydata-profiling
|
scrape.py
ADDED
|
@@ -0,0 +1,231 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests
|
| 2 |
+
from bs4 import BeautifulSoup
|
| 3 |
+
import re
|
| 4 |
+
from urllib.parse import urljoin
|
| 5 |
+
import numpy as np
|
| 6 |
+
from sklearn.preprocessing import LabelEncoder
|
| 7 |
+
import traceback
|
| 8 |
+
from selenium import webdriver
|
| 9 |
+
from selenium.webdriver.chrome.service import Service
|
| 10 |
+
|
| 11 |
+
import chromedriver_autoinstaller
|
| 12 |
+
from selenium.common import exceptions
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
chromedriver_autoinstaller.install()
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
options = webdriver.ChromeOptions()
|
| 19 |
+
options.add_argument("--headless")
|
| 20 |
+
options.add_argument("--disable-dev-shm-usage")
|
| 21 |
+
options.add_argument("--no-sandbox")
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def selnium(url):
    """Fetch *url* with headless Chrome and dump the rendered page source.

    The rendered HTML is written to ``temp/temp.html`` so that callers
    (scrape() / get_live_matches()) can parse it with BeautifulSoup.

    Args:
        url: Page to load in the headless browser.

    Returns:
        bool: True on success, False if the browser session failed.
    """
    driver = None
    try:
        driver = webdriver.Chrome(options=options)
        driver.get(url)
        # Persist the fully rendered DOM for offline parsing.
        with open("temp/temp.html", "w+") as f:
            f.write(driver.page_source)
        return True
    except exceptions.InvalidSessionIdException:
        # Browser session died (crashed / was killed) mid-fetch.
        # BUG FIX: the original printed e.message, which does not exist on
        # Python 3 exceptions and raised AttributeError inside the handler.
        print(traceback.format_exc())
        return False
    except BaseException:
        # Broad catch is intentional: any scraping failure is reported to the
        # caller as False instead of crashing the Streamlit app.
        print(traceback.format_exc())
        return False
    finally:
        # Always release the browser, even when driver.get() raised;
        # the original leaked the Chrome process on any exception.
        if driver is not None:
            try:
                driver.quit()
            except BaseException:
                pass
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def get_batting_team(soup, status, inning, teams_this_match):
    """Identify the batting/bowling teams and label-encode both.

    Args:
        soup: Parsed page (currently unused; kept for interface stability).
        status: Cricbuzz status line, e.g. "India need 52 runs" (chasing)
            or "Australia opt to bat" (first innings, toss decision).
        inning: 1 or 2; in the 2nd innings the chasing team is named
            before the word "need" in *status*.
        teams_this_match: The two team names playing this match.

    Returns:
        tuple: (batting_team, bowling_team, batting_team_enc,
        bowling_team_enc). The *_enc values are ints from the trained
        LabelEncoder, or None when a team is absent from its classes.
    """
    batting_team = ""
    if inning == 2:
        # Chasing side is named before "need" in the status string.
        batting_team = status.split("need")[0].strip()
        # Normalize the extracted fragment to the canonical team name.
        for team in teams_this_match:
            if team.lower() in batting_team.lower():
                batting_team = team
    else:
        # First innings: infer the batting side from the toss decision.
        for idx, team in enumerate(teams_this_match):
            if team.lower() in status.lower():
                if "opt to bowl" in status.lower():
                    # ~idx maps 0 -> -1 and 1 -> -2, i.e. "the other team"
                    # for a two-element tuple.
                    batting_team = teams_this_match[int(~idx)]
                elif "opt to bat" in status.lower():
                    batting_team = team
                else:
                    # BUG FIX: message had a stray ')' in the original.
                    print("Could not get batting team")
    bowling_team = list(set(teams_this_match).difference([batting_team]))[0]
    print(f"{batting_team=}, {bowling_team=}")
    batting_team_enc, bowling_team_enc = None, None
    # Rehydrate the encoder trained in model.py from its saved classes.
    le = LabelEncoder()
    le.classes_ = np.load("model/team.npy", allow_pickle=True)
    if batting_team in le.classes_:
        batting_team_enc = le.transform([batting_team])[0]
    if bowling_team in le.classes_:
        bowling_team_enc = le.transform([bowling_team])[0]
    return batting_team, bowling_team, batting_team_enc, bowling_team_enc
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def scrape(url):
    """Scrape a Cricbuzz live-score page into model-ready match features.

    Fetches *url* with headless Chrome (selnium()), then parses the rendered
    HTML for the match state, score, run rates and team information.

    Returns:
        A 1-tuple ``(error_message,)`` on any failure.
        Otherwise an 18-tuple:
        (matchState, score, run_last_5_overs, wkt_last_5_overs, runs, wkts,
         overs, req_rr, req, crr, format, title, status, batting_team,
         bowling_team, batting_team_enc, bowling_team_enc, inning),
        with the numeric slots set to None when the match is not in progress.
    """
    try:
        if selnium(url) is False:
            return ("Selenium scrape error",)
        with open("temp/temp.html", "r") as f:
            soup = BeautifulSoup(f.read(), "html.parser")
        # Cricbuzz embeds match metadata in inline <script> variables;
        # join all script bodies once instead of rebuilding per regex.
        scripts_text = "\n".join(map(lambda x: x.text, soup.find_all("script")))
        matchState = re.findall(
            r'var matchState ="([\da-zA-Z]*)"', scripts_text
        )[0].lower()
        print(f"{matchState=}")
        title = soup.find_all("title")[0].text
        format = re.findall(
            r'var matchFormat = "([\da-zA-Z]*)"', scripts_text
        )[0]
        print(f"{format=}")
        if format not in {"ODI", "T20"}:
            raise BaseException("Not ODI or T20")
        # The status banner's CSS class depends on the match state.
        if matchState == "inprogress":
            status = soup.find_all("div", {"class": "cb-text-inprogress"})[0].text
        elif matchState == "complete":
            status = soup.find_all("div", {"class": "cb-text-complete"})[0].text
        elif matchState == "inningsbreak":
            status = soup.find_all("div", {"class": "cb-text-inningsbreak"})[0].text
        else:
            status = ""
        # BUG FIX: the original membership test said "inningbreak" (missing
        # an "s"), so the score was never extracted during an innings break.
        score = (
            soup.find_all("div", {"class": "cb-min-bat-rw"})[0].text
            if matchState in ["complete", "inprogress", "inningsbreak"]
            else ""
        )
        if matchState != "inprogress":
            # Nothing to predict; keep the 18-slot shape with None padding.
            return (
                matchState,
                score,
                None, None, None, None, None, None, None, None,
                format,
                title,
                status,
                None, None, None, None, None,
            )
        teams_this_match = re.match(
            r"(.*) vs (.*)",
            soup.find_all("a", {"class": "cb-nav-tab"})[0]["title"].split(",")[0],
        ).groups()
        print(f"{teams_this_match=}")

        # The last "runs/wkts (overs)" figure on the page is the live innings.
        data = re.findall(r"(\d+)/(\d+) \(([\.\d]+)\)", soup.text)
        runs, wkts, overs = map(float, data[-1])
        print(f"{runs=}, {wkts=}, {overs=}")

        if overs >= 5:
            last_5_ovs = (
                soup.find_all("span", string="Last 5 overs")[0].findNext("span").text
            )
            run_last_5_overs, wkt_last_5_overs = map(
                float, re.match(r"(\d+) runs, (\d+) wkts", last_5_ovs).groups()
            )
        else:
            # Too early in the innings: the whole innings *is* the last 5 overs.
            run_last_5_overs, wkt_last_5_overs = runs, wkts
        print(f"{run_last_5_overs=}, {wkt_last_5_overs=}")

        # -9999 is the project-wide sentinel for "not applicable / unparsed".
        req_rr = -9999
        if soup.find_all("span", string="\xa0\xa0REQ:\xa0"):
            reqdata = (
                soup.find_all("span", string="\xa0\xa0REQ:\xa0")[0]
                .findNext("span")
                .text
            )
            if reqdata.strip() != "":
                req_rr = list(map(float, re.match(r"([\d\.]+)", reqdata).groups()))[0]
            else:
                print("REQ_RR not parsed")

        crr = -9999
        if soup.find_all("span", string="\xa0\xa0CRR:\xa0"):
            crrdata = (
                soup.find_all("span", string="\xa0\xa0CRR:\xa0")[0]
                .findNext("span")
                .text
            )
            if crrdata.strip() != "":
                crr = list(map(float, re.match(r"([\d\.]+)", crrdata).groups()))[0]
            else:
                print("CRR not parsed")

        print(f"{crr=}, {req_rr=}")

        # A required run-rate only exists while chasing, i.e. 2nd innings.
        inning = 2 if req_rr > 0 else 1
        (
            batting_team,
            bowling_team,
            batting_team_enc,
            bowling_team_enc,
        ) = get_batting_team(soup, status, inning, teams_this_match)

        req = -9999
        if inning == 2:
            req = int(re.match(r".*need (\d+) runs", status).groups()[0])
            print(f"{req=}")
        else:
            print("Not chasing so target not set")

        return (
            matchState,
            score,
            run_last_5_overs,
            wkt_last_5_overs,
            runs,
            wkts,
            overs,
            req_rr,
            req,
            crr,
            format,
            title,
            status,
            batting_team,
            bowling_team,
            batting_team_enc,
            bowling_team_enc,
            inning,
        )
    except BaseException as e:
        print(traceback.format_exc())
        return (str(e),)
|
| 214 |
+
|
| 215 |
+
|
| 216 |
+
def get_live_matches(url):
    """Return a mapping of live-match titles to absolute match URLs.

    Scrapes the Cricbuzz homepage navigation menu for live matches.

    Args:
        url: Cricbuzz base URL (e.g. "https://cricbuzz.com").

    Returns:
        dict[str, str] of {match title: absolute url}, or None when the
        page could not be fetched.
    """
    if selnium(url) is False:
        return None
    # BUG FIX: the original left the file handle open; use a context manager.
    with open("temp/temp.html", "r") as f:
        soup = BeautifulSoup(f.read(), "html.parser")
    matches = soup.find_all("a", {"class": "cb-mat-mnu-itm cb-ovr-flo"})
    # Exclude the generic "live scores" navigation link from the menu items.
    return {
        m.text: urljoin(url, m.get("href"))
        for m in matches
        if m not in soup.find_all("a", {"id": "live-scores-link"})
    }
|
| 226 |
+
|
| 227 |
+
|
| 228 |
+
if __name__ == "__main__":
    # Manual smoke test: scrape one known live-score page and dump the tuple.
    sample_url = (
        "https://cricbuzz.com/live-cricket-scores/79055/"
        "wa-vs-saus-3rd-match-australia-domestic-one-day-cup-2023-24"
    )
    print(scrape(sample_url))
    # print(get_live_matches("https://cricbuzz.com"))
|
serve.py
ADDED
|
@@ -0,0 +1,472 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from scrape import scrape, get_live_matches
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import streamlit as st
|
| 4 |
+
from streamlit_echarts import st_echarts
|
| 5 |
+
import joblib
|
| 6 |
+
import numpy as np
|
| 7 |
+
import math, os
|
| 8 |
+
import datetime, time
|
| 9 |
+
import matplotlib.pyplot as plt
|
| 10 |
+
|
| 11 |
+
import pathlib

# Create every working directory up front so later file writes (scraped
# HTML, history CSVs, models, results) never fail on a missing folder.
for folder in ["data", "model", "history", "result", "temp"]:
    pathlib.Path(folder).mkdir(parents=True, exist_ok=True)

# ## Test on realdata

# In[16]:

# `features` is the canonical model-input column order defined in model.py;
# the commented list below documents its expected contents for reference.
from model import features
import streamlit_analytics

# features = [
#     "batting_team",
#     "balls",
#     "runs",
#     "wickets",
#     "wkt_last_5_overs",
#     "runrate_last_5_overs",
#     "current_RR",
#     "average",
#     "balls_left",
#     "wkts_left",
#     "required_RR",
#     "projected_score_more",
#     "min_score_more",
#     "max_score_more",
#     "projected_avg_score_more",
# ]

# Encoded ids (0..n-1) of every team the model was trained on; predict()
# averages over all of these when the current batting team is unknown.
all_teams_enc = list(range(len(np.load("model/team.npy", allow_pickle=True))))
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def overtoball(over):
    """Convert a cricket overs figure (e.g. 12.3) to a ball count (e.g. 75).

    The digits after the decimal point are *balls* (0-5), not a fraction:
    12.3 means 12 overs and 3 balls. The ball part is clamped to 6 to guard
    against malformed input.

    Args:
        over: Overs as float, int or str, e.g. 12.3, "49.5" or 12.

    Returns:
        int: Total deliveries bowled.
    """
    text = str(over)
    whole, dot, part = text.partition(".")
    full = int(whole) * 6
    # BUG FIX: for inputs with no decimal point (e.g. the int 12) the
    # original reused the whole number as the ball part (min(12, 6) -> 6,
    # giving 78 instead of 72). No dot means zero extra balls.
    extra = min(int(part), 6) if dot else 0
    print(f"{over=}", "balls=", full + extra)
    return full + extra
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def save_history(fname, row, total_balls):
    """Append the latest prediction row to history/<fname> and plot progress.

    Args:
        fname: CSV file name (derived from the match URL by predict()).
        row: Single-row DataFrame with at least the columns
            'balls_left', 'runs', 'predicted' and 'projected'.
        total_balls: Innings length in balls (120 for T20, 300 for ODI).

    Returns:
        matplotlib.figure.Figure: runs-so-far curve plus predicted and
        projected final-score extrapolations.
    """
    # Append when the history file already exists; otherwise create it and
    # write the header only on creation.
    # NOTE(review): isfile() is checked twice and could race with another
    # writer; acceptable for a single-user Streamlit app.
    row.to_csv(
        os.path.join("history", fname),
        mode="a" if os.path.isfile(os.path.join("history", fname)) else "w+",
        header=not os.path.isfile(os.path.join("history", fname)),
    )
    fig, ax = plt.subplots()
    historydf = pd.read_csv(os.path.join("history", fname))
    # X axis: balls bowled so far, derived from balls remaining.
    balls = (total_balls - historydf["balls_left"]).to_list()
    runs = historydf["runs"].astype(int).to_list()
    ax.plot(balls, runs, label="So Far")
    # Extend the x axis to the end of the innings for the extrapolations.
    balls.append(total_balls)
    # Model's predicted final score, drawn from the last real point.
    pred_runs = runs + [historydf["predicted"].astype(int).iloc[-1]]
    ax.plot(balls[-2:], pred_runs[-2:], label="Predicted")
    # Naive current-run-rate projection, for comparison.
    proj_runs = runs + [historydf["projected"].astype(int).iloc[-1]]
    ax.plot(balls[-2:], proj_runs[-2:], label="Projected")
    # Label the current score and both endpoints.
    ax.annotate(str(runs[-1]), xy=(balls[-2], runs[-1]))
    ax.annotate(str(pred_runs[-1]), xy=(balls[-1], pred_runs[-1]))
    ax.annotate(str(proj_runs[-1]), xy=(balls[-1], proj_runs[-1]))
    plt.xlim([0, total_balls])
    plt.ylim([0, max(pred_runs[-1], proj_runs[-1]) + 100])
    ax.set_xlabel("Balls")
    ax.set_ylabel("Runs")
    ax.legend()
    return fig
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def load_model(format):
    """Load the trained regressor for the given match format.

    Args:
        format: "T20" or "ODI".

    Returns:
        The joblib-deserialized model from the model/ directory.

    Raises:
        ValueError: if *format* is not "T20" or "ODI".
        (The original built "model/" + None for unknown formats, which
        surfaced as an opaque TypeError.)
    """
    model_files = {
        "T20": "t20features.feather.joblib",
        "ODI": "odifeatures.feather.joblib",
    }
    if format not in model_files:
        raise ValueError(f"Unsupported match format: {format!r}")
    return joblib.load("model/" + model_files[format])
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
def simulator(args, format):
    """Run the trained model for *format* on a single hand-built feature row.

    Args:
        args: Mapping of feature name -> value (insertion order is the
            column order).
        format: "T20" or "ODI"; selects which trained model to load.

    Returns:
        The model's prediction array for the single row.
    """
    feature_row = pd.DataFrame([args.values()], columns=args.keys())
    regressor = load_model(format)
    return regressor.predict(feature_row)
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
def predict(url):
    """Scrape a live match and predict the batting side's final score.

    Args:
        url: Cricbuzz live-score page URL.

    Returns:
        [error_message] (a 1-element list) when scraping failed;
        a 10-tuple (matchState, predicted, score, format, title, status,
        inning, batting_team, batting_team_win, fig) otherwise, where
        `predicted` and `fig` are None when the match is not in progress.
    """
    # History file name: URL stripped to alphanumerics, one CSV per match.
    fname = "".join(list(filter(str.isalnum, url))) + ".csv"
    ret = scrape(url)
    print(ret)
    if len(ret) == 1:
        # scrape() signals failure with a 1-tuple containing the error text.
        err = ret[0]
        return [err]
    else:
        (
            matchState,
            score,
            run_last_5_overs,
            wkt_last_5_overs,
            runs,
            wkts,
            overs,
            req_rr,
            req,
            current_rr,
            format,
            title,
            status,
            batting_team,
            bowling_team,
            batting_team_enc,
            bowling_team_enc,
            inning,
        ) = ret
    if matchState != "inprogress":
        # Nothing to predict; return display fields only.
        return matchState, None, score, format, title, status, None, None, None, None

    total_balls = 120 if format == "T20" else 300 if format == "ODI" else None
    balls = overtoball(overs)
    # Last-5-overs run rate; capped at the balls actually bowled early on.
    # NOTE(review): divides by balls — would raise ZeroDivisionError at 0.0
    # overs; confirm scrape() never reports an in-progress match at 0 balls.
    rr_last_5_overs = (int(run_last_5_overs) * 6) / min(30, balls)
    # current_rr = (runs * 6) / balls
    # Runs per wicket, with +1 to avoid division by zero at 0 wickets.
    avg = runs / (wkts + 1)
    # NOTE(review): no-op assignment kept from the original; safe to remove.
    req_rr = req_rr
    wkts_left = 10 - wkts
    # 2nd innings: balls left until the target is reached at the required RR.
    balls_left = (total_balls - balls) if inning == 1 else math.ceil(req * 6 / req_rr)
    # Loose plausibility bounds (0.5 and 3 runs per ball).
    min_score_avg, max_score_avg = (
        math.ceil(balls_left * 0.5),
        math.ceil(balls_left * 3),
    )
    # Momentum signal: recent run rate vs overall run rate.
    rr_diff = rr_last_5_overs - current_rr
    # Feature row in the schema the model was trained on (see model.py).
    inputs = {
        "batting_team": batting_team_enc,
        "balls": balls,
        "runs": runs,
        "wickets": wkts,
        "wkt_last_5_overs": wkt_last_5_overs,
        "runrate_last_5_overs": rr_last_5_overs,
        "current_RR": current_rr,
        "runrate_last_5_overs-current_RR": rr_diff,
        "average": avg,
        "balls_left": int(balls_left),
        "wkts_left": int(wkts_left),
        "required_RR": -9999,
        "projected_score_more": math.ceil(balls_left * ((runs) / (balls))),
        "min_score_more": math.ceil(balls_left * 0.5),
        "max_score_more": math.ceil(balls_left * 3),
        "projected_avg_score_more": math.ceil((10 - wkts) * runs / (1 + wkts)),
    }
    inputdf = pd.DataFrame(inputs, index=[0])
    if batting_team_enc is None:
        # Unknown team: replicate the row for every trained team id so the
        # model prediction can be averaged across all of them.
        inputdf = inputdf.drop(columns=["batting_team"])
        inputdf = pd.concat([inputdf] * len(all_teams_enc))
        inputdf["batting_team"] = all_teams_enc
    # Enforce the training-time column order.
    inputdf = inputdf[features]
    model = load_model(format)
    h = model.predict(inputdf)
    print(f"{h=}")
    # Naive extrapolation at the current run rate, for comparison.
    projected_score_more = balls_left * current_rr / 6
    projected = math.ceil(projected_score_more + runs)
    # Model output is learned as an offset on top of the naive projection.
    predicted_score_more = math.ceil(h.mean() + projected_score_more)
    # predicted_score_more = min(max(min_score_avg, predicted_score_more), max_score_avg)
    predicted = runs + predicted_score_more

    print(f"{runs=}, {projected=}, {predicted=}")
    inputdf["timestamp"] = datetime.datetime.now()
    inputdf["runs"] = runs
    if inning == 2:
        target = req + runs
        print(f"{target=}")
        inputdf["target"] = target
        # Positive margin -> chasing side predicted to win.
        batting_team_win = int(predicted - target)
    else:
        batting_team_win = None
        inputdf["target"] = -9999
    inputdf["predicted"] = int(predicted)
    inputdf["projected"] = int(projected)
    print(inputdf.to_string())
    fig = save_history(fname, inputdf, total_balls)

    return (
        matchState,
        predicted,
        score,
        format,
        title,
        status,
        inning,
        batting_team,
        batting_team_win,
        fig,
    )
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
def getoption(predicted, maxscore):
    """Build the ECharts gauge configuration for a predicted score.

    Args:
        predicted: Predicted final score to display on the gauge.
        maxscore: Gauge maximum (300 for T20, 500 for ODI).

    Returns:
        dict: ECharts option with a single half-circle gauge series.
    """
    # Quartile colour bands, from worst (red) to best (blue).
    band_colors = [
        [0.25, "#FF403F"],
        [0.5, "#FDDD60"],
        [0.75, "#00FF00"],
        [1, "#0000FF"],
    ]
    # Title colour matches the band the prediction falls into.
    if predicted > maxscore * 0.75:
        title_color = "#0000FF"
    elif predicted > maxscore * 0.5:
        title_color = "#00FF00"
    elif predicted > maxscore * 0.25:
        title_color = "#FDDD60"
    else:
        title_color = "#FF403F"
    gauge = {
        "type": "gauge",
        # Half circle: sweep from 180 degrees down to 0.
        "startAngle": 180,
        "endAngle": 0,
        "min": 0,
        "max": maxscore,
        "center": ["50%", "50%"],
        "splitNumber": 4,
        "axisLine": {"lineStyle": {"width": 6, "color": band_colors}},
        "pointer": {
            "icon": "path://M12.8,0.7l12,40.1H0.7L12.8,0.7z",
            "length": "12%",
            "width": 30,
            "offsetCenter": [0, "-60%"],
            "itemStyle": {"color": "auto"},
        },
        "axisTick": {
            "length": 10,
            "lineStyle": {"color": "auto", "width": 2},
        },
        "splitLine": {
            "length": 15,
            "lineStyle": {"color": "auto", "width": 5},
        },
        "axisLabel": {
            "fontSize": 12,
            "distance": -60,
        },
        "title": {
            "offsetCenter": [0, "-20%"],
            "fontSize": 20,
            "color": title_color,
        },
        "detail": {
            "fontSize": 15,
            "offsetCenter": [0, "0%"],
            "valueAnimation": True,
            "color": "auto",
            "formatter": "Predicted Score: {value}",
        },
        "data": [
            {
                "value": round(predicted),
            }
        ],
    }
    return {"series": [gauge]}
|
| 283 |
+
|
| 284 |
+
|
| 285 |
+
def timestamp(func):
    """Decorator that logs wall-clock start/end times around *func*.

    BUG FIX: the original wrapper accepted only positional arguments
    (keyword calls raised TypeError) and discarded the wrapped function's
    metadata; both are fixed via **kwargs and functools.wraps.

    Args:
        func: Callable to wrap.

    Returns:
        The wrapped callable with identical signature and return value.
    """
    from functools import wraps

    @wraps(func)
    def caller(*args, **kwargs):
        print(
            "\n---->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Initiated: ",
            datetime.datetime.now(),
            "<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<----",
        )
        ret = func(*args, **kwargs)
        print(
            "\n---->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Completed: ",
            datetime.datetime.now(),
            "<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<----",
        )
        return ret

    return caller
|
| 301 |
+
|
| 302 |
+
|
| 303 |
+
@timestamp
def render(url):
    """Fetch, predict and format one match for the Streamlit UI.

    Args:
        url: Cricbuzz live-score page URL.

    Returns:
        (markdown, option, fig): display text, ECharts gauge option (or
        None when there is no prediction), and the history matplotlib
        figure (or None).
        NOTE(review): on scrape failure the first element is a *list*
        rather than the joined string returned on success.
    """
    markdown = []
    option = None
    print("fetching from", url)
    ret = predict(url.strip())
    if len(ret) == 1:
        # predict() signals a scrape failure with a 1-element list.
        err = ret[0]
        markdown.append("Error fetching url...")
        return markdown, None, None
    (
        matchState,
        predicted,
        score,
        format,
        title,
        status,
        inning,
        batting_team,
        batting_team_win,
        fig,
    ) = ret

    if matchState:
        markdown.append("Live score credits: cricbuzz.com")
    if title:
        # Page titles look like "<match> | Cricbuzz.com"; show the series
        # part first, then the match part, with branding stripped.
        if "|" in title:
            l1 = (
                title.split("|")[1]
                .replace("Cricbuzz.com", "")
                .replace("Cricbuzz", "")
            )
            if l1.strip():
                markdown.append(l1.strip())
            l2 = (
                title.split("|")[0]
                .replace("Cricbuzz.com", "")
                .replace("Cricbuzz", "")
            )
            if l2.strip():
                markdown.append(l2.strip())
        else:
            markdown.append(
                title.replace("Cricbuzz.com", "").replace("Cricbuzz", "")
            )
    # One-line summary: status; score; match state.
    nutshell = ""
    if status:
        nutshell += status + "; "
    if score:
        nutshell += score + "; "
    if matchState:
        nutshell += matchState + "; "
    if nutshell:
        markdown.append(nutshell)
    # if matchState and matchState != "inprogress":
    #     markdown.append(matchState)
    if predicted:
        if inning == 2:
            # Positive margin means the chasing side is predicted to win.
            if batting_team_win >= 0:
                markdown.append(f"{batting_team} may win")
            else:
                markdown.append(
                    f"{batting_team} may lose by {-int(batting_team_win)} runs"
                )
        maxscore = 300 if format == "T20" else 500 if format == "ODI" else None
        option = getoption(predicted, maxscore)
    if matchState is None:
        markdown.append("Error fetching url...")

    return "\n".join(markdown), option, fig
|
| 373 |
+
|
| 374 |
+
|
| 375 |
+
if __name__ == "__main__":
    # Streamlit entry point; streamlit_analytics records page usage.
    with streamlit_analytics.track(unsafe_password="credict123"):
        st.set_page_config(page_title="Cricket Prophet")
        st.title("Cricket Prophet")
        st.write("**An ML-driven Cricket Score Predictor**")

        live_matches = get_live_matches("https://cricbuzz.com")
        if live_matches:
            # NOTE(review): `option` is later reused for the ECharts config
            # returned by render(), shadowing this selectbox value.
            option = st.selectbox(
                "Choose a live match here",
                list(live_matches.keys()) + ["Custom URL", "Simulator"],
            )
            if option == "Simulator":
                # Manual what-if mode: the user types the feature values.
                format = st.selectbox("Format", ["T20", "ODI"])
                args = {}
                # Fixed placeholder team id for simulation.
                args["batting_team"] = 1
                args["wkt_last_5_overs"] = st.number_input(
                    "wkt_last_5_overs", value=0.0, step=0.01, format="%f"
                )
                args["current_RR"] = st.number_input(
                    "current_RR", value=0.0, step=0.01, format="%f"
                )
                args["balls_left"] = st.number_input(
                    "balls_left", value=0.0, step=0.01, format="%f"
                )
                args["wkts_left"] = st.number_input(
                    "wkts_left", value=0.0, step=0.01, format="%f"
                )
                # The model feature is the *difference* between recent and
                # overall run rate, computed from the two inputs.
                args["runrate_last_5_overs-current_RR"] = (
                    st.number_input(
                        "runrate_last_5_overs", value=0.0, step=0.01, format="%f"
                    )
                    - args["current_RR"]
                )
                balls = 300 if format == "ODI" else 120
                # Final score = naive full-innings projection + model offset.
                st.text(
                    str(int((balls * args["current_RR"] / 6) + simulator(args, format)))
                )
            else:
                if option == "Custom URL":
                    url = st.text_input("Enter cricbuzz match link")
                else:
                    url = live_matches.get(option)

                col1, col2 = st.columns([3.5, 0.6])

                with col1:
                    live = st.button("Live", help="Livestream")
                with col2:
                    fetch = st.button("Fetch", help="Refresh")

                col3, _ = st.columns([1, 4])
                with col3:
                    # Polling period for the "Live" auto-refresh loop.
                    interval = st.number_input(
                        label="Sync Interval (Seconds)", step=1, min_value=1, value=100
                    )

                placeholder = st.empty()

                # One-shot refresh.
                if fetch:
                    if url:
                        markdown, option, fig = render(url)
                        placeholder.empty()
                        with placeholder.container():
                            st.text(markdown)
                            st.text(f"Last updated at {time.strftime('%H:%M %p')}")
                            if option:
                                st_echarts(
                                    option,
                                    width="450px",
                                    height="350px",
                                    key="gauge" + str(datetime.datetime.now()),
                                )
                            if fig:
                                st.pyplot(fig)

                # Livestream: re-render every `interval` seconds until the
                # match stops producing a history figure.
                # NOTE(review): indentation reconstructed from the diff; the
                # `else: break` is read as the exit path of `if fig:` —
                # confirm against the original file.
                if live:
                    if url:
                        while True:
                            markdown, option, fig = render(url)
                            placeholder.empty()
                            with placeholder.container():
                                st.text(markdown)
                                st.text(f"Last updated at {time.strftime('%H:%M %p')}")
                                if option:
                                    st_echarts(
                                        option,
                                        width="450px",
                                        height="350px",
                                        key="gauge" + str(datetime.datetime.now()),
                                    )
                                if fig:
                                    st.pyplot(fig)
                                else:
                                    break
                            time.sleep(interval)
        else:
            st.text("Error fetching matches")
|
server.sh
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/zsh
# Launch the Streamlit app using the project's local virtualenv.
source env/bin/activate && streamlit run serve.py
|
trainandserve.sh
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/zsh
# Prepare working directories, then run the full pipeline:
# install deps -> feature extraction -> model training -> Streamlit serving.
mkdir -p data history model result temp
source env/bin/activate && pip install -r requirements.txt && python features.py && python model.py && streamlit run serve.py
|