Upload folder using huggingface_hub
Browse files
This view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +3 -0
- .history/README_20251005095904.md +0 -0
- .history/README_20251005100625.md +0 -0
- .history/README_20251005103318.md +0 -0
- .history/README_20251005103328.md +0 -0
- .history/README_20251005103511.md +0 -0
- .history/README_20251005103517.md +0 -0
- .history/README_20251007193812.md +0 -0
- .history/README_20251007193817.md +0 -0
- .history/README_20251007193828.md +0 -0
- .history/README_20251007193832.md +0 -0
- .history/fbrefdata_example_20251004173710.py +49 -0
- .history/fbrefdata_example_20251004180332.py +0 -0
- .history/fbrefdata_example_20251004180335.py +43 -0
- .history/fbrefdata_example_20251004180434.py +60 -0
- .history/fbrefdata_example_20251004180520.py +74 -0
- .history/fbrefdata_example_20251004180621.py +67 -0
- .history/fbrefdata_example_20251004184139.py +72 -0
- .history/fbrefdata_example_20251004185739.py +65 -0
- .history/fbrefdata_example_20251004185920.py +68 -0
- .history/fbrefdata_example_20251004190022.py +0 -0
- .history/fbrefdata_example_20251004190027.py +69 -0
- .history/fbrefdata_example_20251004190339.py +82 -0
- .history/fbrefdata_example_20251004190507.py +85 -0
- .history/fbrefdata_example_20251004190633.py +90 -0
- .history/fbrefdata_example_20251004190944.py +91 -0
- .history/fbrefdata_example_20251004191947.py +107 -0
- .history/fbrefdata_example_20251005091604.py +104 -0
- .history/fbrefdata_example_20251005091825.py +104 -0
- .history/fbrefdata_example_20251005091830.py +104 -0
- .history/fbrefdata_example_20251005091835.py +104 -0
- .history/fbrefdata_example_20251005091839.py +104 -0
- .history/fbrefdata_example_20251005091854.py +104 -0
- .history/fbrefdata_example_20251005091857.py +104 -0
- .history/fbrefdata_example_20251005091858.py +104 -0
- .history/fbrefdata_example_20251005092140.py +106 -0
- .history/fbrefdata_example_20251005092144.py +105 -0
- .history/fbrefdata_example_20251005092150.py +105 -0
- .history/fbrefdata_example_20251005092800.py +106 -0
- .history/fbrefdata_example_20251005092803.py +105 -0
- .history/fbrefdata_example_20251005092809.py +105 -0
- .history/fbrefdata_example_20251005092817.py +106 -0
- .history/fbrefdata_example_20251005092820.py +106 -0
- .history/fbrefdata_example_20251005092822.py +105 -0
- .history/fbrefdata_example_20251005092904.py +131 -0
- .history/fbrefdata_example_20251005093119.py +61 -0
- .history/fbrefdata_example_20251005093129.py +61 -0
- .history/fbrefdata_example_20251005093230.py +131 -0
- .history/historical_data_20251005104339.py +0 -0
- .history/historical_data_20251005104343.py +140 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
visual/debug_screenshot.png filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
visual/debug_team_stats.png filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
visual/top10_passing_accuracy.png filter=lfs diff=lfs merge=lfs -text
|
.history/README_20251005095904.md
ADDED
|
Binary file (60 Bytes). View file
|
|
|
.history/README_20251005100625.md
ADDED
|
Binary file (9.64 kB). View file
|
|
|
.history/README_20251005103318.md
ADDED
|
Binary file (9.65 kB). View file
|
|
|
.history/README_20251005103328.md
ADDED
|
Binary file (9.57 kB). View file
|
|
|
.history/README_20251005103511.md
ADDED
|
Binary file (5.72 kB). View file
|
|
|
.history/README_20251005103517.md
ADDED
|
Binary file (9.57 kB). View file
|
|
|
.history/README_20251007193812.md
ADDED
|
Binary file (9.75 kB). View file
|
|
|
.history/README_20251007193817.md
ADDED
|
Binary file (9.74 kB). View file
|
|
|
.history/README_20251007193828.md
ADDED
|
Binary file (9.73 kB). View file
|
|
|
.history/README_20251007193832.md
ADDED
|
Binary file (9.7 kB). View file
|
|
|
.history/fbrefdata_example_20251004173710.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from io import StringIO
|
| 4 |
+
|
| 5 |
+
def pull_premier_league_team_passing(timeout=30):
    """Download the Premier League team passing table from FBref.

    Args:
        timeout: Seconds to wait for the server before giving up.

    Returns:
        DataFrame parsed from the first HTML table in the response. Tuple
        (multi-level) column labels are joined with ``"_"`` and the
        ``"Unnamed: N_level_0_*"`` labels listed below are shortened.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with an error status.
        ValueError: If pandas finds no table in the downloaded page.
    """
    url = "https://fbref.com/en/comps/9/passing/Premier-League-Stats"
    print(f"Downloading team passing stats from {url} ...")

    # Add a User-Agent header to mimic a browser
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    # requests never times out on its own; without this a stalled
    # connection would block the script forever.
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()

    df = pd.read_html(StringIO(response.text))[0]

    # Flatten columns: ("Total", "Cmp") -> "Total_Cmp"
    df.columns = ["_".join(col).strip() if isinstance(col, tuple) else col for col in df.columns]

    # Give the columns that had no top-level header a short name
    df = df.rename(columns={
        "Unnamed: 0_level_0_Squad": "Squad",
        "Unnamed: 1_level_0_# Pl": "Players",
        "Unnamed: 2_level_0_90s": "90s",
        "Unnamed: 17_level_0_Ast": "Ast",
        "Unnamed: 18_level_0_xAG": "xAG",
        "Unnamed: 21_level_0_KP": "KP",
        "Unnamed: 22_level_0_1/3": "1/3",
        "Unnamed: 23_level_0_PPA": "PPA",
        "Unnamed: 24_level_0_CrsPA": "CrsPA",
        "Unnamed: 25_level_0_PrgP": "PrgP",
    })

    return df
|
| 34 |
+
|
| 35 |
+
def filter_teams(df, teams):
    """Return the rows of ``df`` whose ``"Squad"`` value is one of ``teams``.

    Args:
        df: DataFrame that has a ``"Squad"`` column.
        teams: Squad names to keep. A single name may be passed as a plain
            string; it is treated as a one-element list.

    Returns:
        A DataFrame holding only the matching rows, in their original order.

    Raises:
        KeyError: If ``df`` has no ``"Squad"`` column.
    """
    # Series.isin() rejects a bare string, so wrap a lone team name.
    if isinstance(teams, str):
        teams = [teams]
    return df[df["Squad"].isin(teams)]
|
| 37 |
+
|
| 38 |
+
def main():
    """Fetch the team passing table and print selected columns for two clubs."""
    passing = pull_premier_league_team_passing()

    wanted_squads = ["Arsenal", "Nott'ham Forest"]
    selected = filter_teams(passing, wanted_squads)

    shown_columns = ["Squad", "Total_Cmp", "Total_Att", "Total_Cmp%", "Total_TotDist"]
    print("\nπ Passing Stats for Arsenal & Nottingham Forest (Team Level)")
    print("=" * 70)
    print(selected[shown_columns])
|
| 47 |
+
|
| 48 |
+
if __name__ == "__main__":
|
| 49 |
+
main()
|
.history/fbrefdata_example_20251004180332.py
ADDED
|
File without changes
|
.history/fbrefdata_example_20251004180335.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from io import StringIO
|
| 4 |
+
|
| 5 |
+
def pull_premier_league_team_passing(timeout=30):
    """Download the Premier League team passing table from FBref.

    Args:
        timeout: Seconds to wait for the server before giving up.

    Returns:
        DataFrame parsed from the first HTML table in the response. Tuple
        (multi-level) column labels are joined with ``"_"`` and the
        ``"Unnamed: N_level_0_*"`` labels listed below are shortened.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with an error status.
        ValueError: If pandas finds no table in the downloaded page.
    """
    url = "https://fbref.com/en/comps/9/passing/Premier-League-Stats"
    print(f"Downloading team passing stats from {url} ...")

    # Use a more comprehensive set of headers to mimic a real browser.
    # Accept-Encoding is deliberately not set: requests advertises only the
    # encodings it can decode, whereas forcing 'br' can yield an undecodable
    # body when no Brotli decoder is installed.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'DNT': '1',  # Do Not Track request header
    }

    # requests never times out on its own; without this a stalled
    # connection would block the script forever.
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()

    df = pd.read_html(StringIO(response.text))[0]

    # Flatten columns: ("Total", "Cmp") -> "Total_Cmp"
    df.columns = ["_".join(col).strip() if isinstance(col, tuple) else col for col in df.columns]

    # Give the columns that had no top-level header a short name
    df = df.rename(columns={
        "Unnamed: 0_level_0_Squad": "Squad",
        "Unnamed: 1_level_0_# Pl": "Players",
        "Unnamed: 2_level_0_90s": "90s",
        "Unnamed: 17_level_0_Ast": "Ast",
        "Unnamed: 18_level_0_xAG": "xAG",
        "Unnamed: 21_level_0_KP": "KP",
        "Unnamed: 22_level_0_1/3": "1/3",
        "Unnamed: 23_level_0_PPA": "PPA",
        "Unnamed: 24_level_0_CrsPA": "CrsPA",
        "Unnamed: 25_level_0_PrgP": "PrgP",
    })

    return df
|
.history/fbrefdata_example_20251004180434.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from io import StringIO
|
| 4 |
+
|
| 5 |
+
def pull_premier_league_team_passing(timeout=30):
    """Download the Premier League team passing table from FBref.

    Args:
        timeout: Seconds to wait for the server before giving up.

    Returns:
        DataFrame parsed from the first HTML table in the response. Tuple
        (multi-level) column labels are joined with ``"_"`` and the
        ``"Unnamed: N_level_0_*"`` labels listed below are shortened.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with an error status.
        ValueError: If pandas finds no table in the downloaded page.
    """
    url = "https://fbref.com/en/comps/9/passing/Premier-League-Stats"
    print(f"Downloading team passing stats from {url} ...")

    # Use a fuller set of headers to imitate a real browser.
    # Accept-Encoding is deliberately not set: requests advertises only the
    # encodings it can decode, whereas forcing 'br' can yield an undecodable
    # body when no Brotli decoder is installed.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'DNT': '1',
    }

    # requests never times out on its own; without this a stalled
    # connection would block the script forever.
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()

    df = pd.read_html(StringIO(response.text))[0]

    # Flatten columns: ("Total", "Cmp") -> "Total_Cmp"
    df.columns = ["_".join(col).strip() if isinstance(col, tuple) else col for col in df.columns]

    # Give the columns that had no top-level header a short name
    df = df.rename(columns={
        "Unnamed: 0_level_0_Squad": "Squad",
        "Unnamed: 1_level_0_# Pl": "Players",
        "Unnamed: 2_level_0_90s": "90s",
        "Unnamed: 17_level_0_Ast": "Ast",
        "Unnamed: 18_level_0_xAG": "xAG",
        "Unnamed: 21_level_0_KP": "KP",
        "Unnamed: 22_level_0_1/3": "1/3",
        "Unnamed: 23_level_0_PPA": "PPA",
        "Unnamed: 24_level_0_CrsPA": "CrsPA",
        "Unnamed: 25_level_0_PrgP": "PrgP",
    })

    return df
|
| 43 |
+
|
| 44 |
+
def filter_teams(df, teams):
    """Return the rows of ``df`` whose ``"Squad"`` value is one of ``teams``.

    Args:
        df: DataFrame that has a ``"Squad"`` column.
        teams: Squad names to keep. A single name may be passed as a plain
            string; it is treated as a one-element list.

    Returns:
        A DataFrame holding only the matching rows, in their original order.

    Raises:
        KeyError: If ``df`` has no ``"Squad"`` column.
    """
    # Series.isin() rejects a bare string, so wrap a lone team name.
    if isinstance(teams, str):
        teams = [teams]
    return df[df["Squad"].isin(teams)]
|
| 46 |
+
|
| 47 |
+
def main():
    """Fetch the team passing table and print selected columns for two clubs."""
    passing = pull_premier_league_team_passing()

    wanted_squads = ["Arsenal", "Nott'ham Forest"]
    selected = filter_teams(passing, wanted_squads)

    print("\nπ Passing Stats for Arsenal & Nottingham Forest (Team Level)")
    print("=" * 70)
    # Show only the relevant columns of the already-filtered DataFrame
    shown_columns = ["Squad", "Total_Cmp", "Total_Att", "Total_Cmp%", "Total_TotDist"]
    print(selected[shown_columns])
|
| 57 |
+
|
| 58 |
+
# Bagian ini PENTING untuk menjalankan fungsi main()
|
| 59 |
+
if __name__ == "__main__":
|
| 60 |
+
main()
|
.history/fbrefdata_example_20251004180520.py
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from io import StringIO
|
| 4 |
+
import random
|
| 5 |
+
import time
|
| 6 |
+
|
| 7 |
+
def pull_premier_league_team_passing(timeout=30):
    """Download the Premier League team passing table from FBref.

    A User-Agent is picked at random for the request, and the function
    pauses 1-3 seconds before returning.

    Args:
        timeout: Seconds to wait for the server before giving up.

    Returns:
        DataFrame parsed from the first HTML table in the response, with
        tuple column labels joined by ``"_"`` and the
        ``"Unnamed: N_level_0_*"`` labels listed below shortened; or
        ``None`` if the request failed (the error is printed).

    Raises:
        ValueError: If pandas finds no table in the downloaded page.
    """
    url = "https://fbref.com/en/comps/9/passing/Premier-League-Stats"
    print(f"Downloading team passing stats from {url} ...")

    # List of User-Agent strings
    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:53.0) Gecko/20100101 Firefox/53.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
    ]

    # Randomly select a User-Agent
    headers = {'User-Agent': random.choice(user_agents)}

    try:
        # requests never times out on its own; a timeout surfaces as a
        # RequestException and is reported below like any other failure.
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
    except requests.exceptions.HTTPError as e:
        print(f"HTTP error occurred: {e}")
        return None
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return None

    df = pd.read_html(StringIO(response.text))[0]

    # Flatten columns: ("Total", "Cmp") -> "Total_Cmp"
    df.columns = ["_".join(col).strip() if isinstance(col, tuple) else col for col in df.columns]

    # Give the columns that had no top-level header a short name
    df = df.rename(columns={
        "Unnamed: 0_level_0_Squad": "Squad",
        "Unnamed: 1_level_0_# Pl": "Players",
        "Unnamed: 2_level_0_90s": "90s",
        "Unnamed: 17_level_0_Ast": "Ast",
        "Unnamed: 18_level_0_xAG": "xAG",
        "Unnamed: 21_level_0_KP": "KP",
        "Unnamed: 22_level_0_1/3": "1/3",
        "Unnamed: 23_level_0_PPA": "PPA",
        "Unnamed: 24_level_0_CrsPA": "CrsPA",
        "Unnamed: 25_level_0_PrgP": "PrgP",
    })

    # Delay before returning (adjust as needed)
    time.sleep(random.uniform(1, 3))  # Delay between 1 and 3 seconds

    return df
|
| 54 |
+
|
| 55 |
+
def filter_teams(df, teams):
    """Return the rows of ``df`` whose ``"Squad"`` value is one of ``teams``.

    Args:
        df: DataFrame that has a ``"Squad"`` column.
        teams: Squad names to keep. A single name may be passed as a plain
            string; it is treated as a one-element list.

    Returns:
        A DataFrame holding only the matching rows, in their original order.

    Raises:
        KeyError: If ``df`` has no ``"Squad"`` column.
    """
    # Series.isin() rejects a bare string, so wrap a lone team name.
    if isinstance(teams, str):
        teams = [teams]
    return df[df["Squad"].isin(teams)]
|
| 57 |
+
|
| 58 |
+
def main():
    """Fetch the team passing table and print selected columns for two clubs.

    Prints a failure message instead when the download returned ``None``.
    """
    passing = pull_premier_league_team_passing()

    # Nothing to show if the download failed.
    if passing is None:
        print("Failed to retrieve data.")
        return

    wanted_squads = ["Arsenal", "Nott'ham Forest"]
    selected = filter_teams(passing, wanted_squads)

    print("\nπ Passing Stats for Arsenal & Nottingham Forest (Team Level)")
    print("=" * 70)
    # Show only the relevant columns of the already-filtered DataFrame
    shown_columns = ["Squad", "Total_Cmp", "Total_Att", "Total_Cmp%", "Total_TotDist"]
    print(selected[shown_columns])
|
| 71 |
+
|
| 72 |
+
# Bagian ini PENTING untuk menjalankan fungsi main()
|
| 73 |
+
if __name__ == "__main__":
|
| 74 |
+
main()
|
.history/fbrefdata_example_20251004180621.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from io import StringIO
|
| 4 |
+
from selenium import webdriver
|
| 5 |
+
from selenium.webdriver.chrome.service import Service as ChromeService
|
| 6 |
+
from webdriver_manager.chrome import ChromeDriverManager
|
| 7 |
+
|
| 8 |
+
def pull_premier_league_team_passing():
    """Load the FBref Premier League passing page in Chrome and parse its first table.

    Returns:
        DataFrame parsed from the first HTML table of the rendered page.
        Tuple (multi-level) column labels are joined with ``"_"`` and the
        ``"Unnamed: N_level_0_*"`` labels listed below are shortened.

    Raises:
        ValueError: If pandas finds no table in the page source.
    """
    url = "https://fbref.com/en/comps/9/passing/Premier-League-Stats"
    print(f"Opening browser to download team passing stats from {url} ...")

    # Start Chrome with an automatically installed driver; the browser
    # opens, fetches the page, then is closed again.
    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
    try:
        # Open the URL
        driver.get(url)

        # Allow 3 seconds for the page and its elements (including the
        # table) to finish loading
        time.sleep(3)

        # Grab the HTML of the page as loaded by the browser
        html_source = driver.page_source
    finally:
        # Always close the browser, even if loading the page failed,
        # so no Chrome process is left running.
        driver.quit()

    print("Data downloaded. Processing with pandas...")

    # Parse the captured HTML with pandas
    df = pd.read_html(StringIO(html_source))[0]

    # Flatten columns: ("Total", "Cmp") -> "Total_Cmp"
    df.columns = ["_".join(col).strip() if isinstance(col, tuple) else col for col in df.columns]

    # Give the columns that had no top-level header a short name
    df = df.rename(columns={
        "Unnamed: 0_level_0_Squad": "Squad",
        "Unnamed: 1_level_0_# Pl": "Players",
        "Unnamed: 2_level_0_90s": "90s",
        "Unnamed: 17_level_0_Ast": "Ast",
        "Unnamed: 18_level_0_xAG": "xAG",
        "Unnamed: 21_level_0_KP": "KP",
        "Unnamed: 22_level_0_1/3": "1/3",
        "Unnamed: 23_level_0_PPA": "PPA",
        "Unnamed: 24_level_0_CrsPA": "CrsPA",
        "Unnamed: 25_level_0_PrgP": "PrgP",
    })

    return df
|
| 52 |
+
|
| 53 |
+
def filter_teams(df, teams):
    """Return the rows of ``df`` whose ``"Squad"`` value is one of ``teams``.

    Args:
        df: DataFrame that has a ``"Squad"`` column.
        teams: Squad names to keep. A single name may be passed as a plain
            string; it is treated as a one-element list.

    Returns:
        A DataFrame holding only the matching rows, in their original order.

    Raises:
        KeyError: If ``df`` has no ``"Squad"`` column.
    """
    # Series.isin() rejects a bare string, so wrap a lone team name.
    if isinstance(teams, str):
        teams = [teams]
    return df[df["Squad"].isin(teams)]
|
| 55 |
+
|
| 56 |
+
def main():
    """Fetch the team passing table and print selected columns for two clubs."""
    passing = pull_premier_league_team_passing()

    wanted_squads = ["Arsenal", "Nott'ham Forest"]
    selected = filter_teams(passing, wanted_squads)

    shown_columns = ["Squad", "Total_Cmp", "Total_Att", "Total_Cmp%", "Total_TotDist"]
    print("\nπ Passing Stats for Arsenal & Nottingham Forest (Team Level)")
    print("=" * 70)
    print(selected[shown_columns])
|
| 65 |
+
|
| 66 |
+
if __name__ == "__main__":
|
| 67 |
+
main()
|
.history/fbrefdata_example_20251004184139.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from io import StringIO
|
| 4 |
+
from selenium import webdriver
|
| 5 |
+
from selenium.webdriver.chrome.service import Service as ChromeService
|
| 6 |
+
from selenium.webdriver.chrome.options import Options as ChromeOptions
|
| 7 |
+
from webdriver_manager.chrome import ChromeDriverManager
|
| 8 |
+
|
| 9 |
+
def pull_premier_league_team_passing():
    """Load the FBref Premier League passing page in Chrome and parse its first table.

    Returns:
        DataFrame parsed from the first HTML table of the rendered page.
        Tuple (multi-level) column labels are joined with ``"_"`` and the
        ``"Unnamed: N_level_0_*"`` labels listed below are shortened.

    Raises:
        ValueError: If pandas finds no table in the page source.
    """
    url = "https://fbref.com/en/comps/9/passing/Premier-League-Stats"
    print(f"Opening browser to download team passing stats from {url} ...")

    # Chrome options
    options = ChromeOptions()
    options.add_argument("--start-maximized")  # open the browser window maximized
    options.add_argument("--no-sandbox")  # often needed when running in automated environments
    options.add_argument("--disable-dev-shm-usage")  # works around limited-resource problems
    options.add_experimental_option("excludeSwitches", ["enable-automation"])  # hides the "Chrome is being controlled..." notice
    options.add_experimental_option('useAutomationExtension', False)

    # Start Chrome with the options built above
    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)
    try:
        # Open the URL
        driver.get(url)

        # Give the page time to load completely (5 seconds to be safe)
        time.sleep(5)

        # Grab the HTML of the page
        html_source = driver.page_source
    finally:
        # Always close the browser, even if loading the page failed,
        # so no Chrome process is left running.
        driver.quit()

    print("Data downloaded. Processing with pandas...")

    df = pd.read_html(StringIO(html_source))[0]

    # Flatten columns: ("Total", "Cmp") -> "Total_Cmp"
    df.columns = ["_".join(col).strip() if isinstance(col, tuple) else col for col in df.columns]

    # Give the columns that had no top-level header a short name
    df = df.rename(columns={
        "Unnamed: 0_level_0_Squad": "Squad",
        "Unnamed: 1_level_0_# Pl": "Players",
        "Unnamed: 2_level_0_90s": "90s",
        "Unnamed: 17_level_0_Ast": "Ast",
        "Unnamed: 18_level_0_xAG": "xAG",
        "Unnamed: 21_level_0_KP": "KP",
        "Unnamed: 22_level_0_1/3": "1/3",
        "Unnamed: 23_level_0_PPA": "PPA",
        "Unnamed: 24_level_0_CrsPA": "CrsPA",
        "Unnamed: 25_level_0_PrgP": "PrgP",
    })

    return df
|
| 57 |
+
|
| 58 |
+
def filter_teams(df, teams):
    """Return the rows of ``df`` whose ``"Squad"`` value is one of ``teams``.

    Args:
        df: DataFrame that has a ``"Squad"`` column.
        teams: Squad names to keep. A single name may be passed as a plain
            string; it is treated as a one-element list.

    Returns:
        A DataFrame holding only the matching rows, in their original order.

    Raises:
        KeyError: If ``df`` has no ``"Squad"`` column.
    """
    # Series.isin() rejects a bare string, so wrap a lone team name.
    if isinstance(teams, str):
        teams = [teams]
    return df[df["Squad"].isin(teams)]
|
| 60 |
+
|
| 61 |
+
def main():
    """Fetch the team passing table and print selected columns for two clubs."""
    passing = pull_premier_league_team_passing()

    wanted_squads = ["Arsenal", "Nott'ham Forest"]
    selected = filter_teams(passing, wanted_squads)

    shown_columns = ["Squad", "Total_Cmp", "Total_Att", "Total_Cmp%", "Total_TotDist"]
    print("\nπ Passing Stats for Arsenal & Nottingham Forest (Team Level)")
    print("=" * 70)
    print(selected[shown_columns])
|
| 70 |
+
|
| 71 |
+
if __name__ == "__main__":
|
| 72 |
+
main()
|
.history/fbrefdata_example_20251004185739.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from io import StringIO
|
| 4 |
+
from selenium import webdriver
|
| 5 |
+
from selenium.webdriver.chrome.service import Service as ChromeService
|
| 6 |
+
from selenium.webdriver.chrome.options import Options as ChromeOptions
|
| 7 |
+
from webdriver_manager.chrome import ChromeDriverManager
|
| 8 |
+
|
| 9 |
+
def pull_premier_league_team_passing():
    """Load the FBref Premier League passing page in Chrome and parse its first table.

    Also prints the flattened column names as a debugging aid.

    Returns:
        DataFrame parsed from the first HTML table of the rendered page.
        Tuple (multi-level) column labels are joined with ``"_"`` and the
        ``"Unnamed: N_level_0_*"`` labels listed below are shortened.

    Raises:
        ValueError: If pandas finds no table in the page source.
    """
    url = "https://fbref.com/en/comps/9/passing/Premier-League-Stats"
    print(f"Opening browser to download team passing stats from {url} ...")

    options = ChromeOptions()
    options.add_argument("--start-maximized")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)

    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)
    try:
        driver.get(url)
        # Fixed wait so the page can finish loading before it is read.
        time.sleep(5)
        html_source = driver.page_source
    finally:
        # Always close the browser, even if loading the page failed,
        # so no Chrome process is left running.
        driver.quit()

    print("Data downloaded. Processing with pandas...")

    df = pd.read_html(StringIO(html_source))[0]
    # Flatten columns: ("Total", "Cmp") -> "Total_Cmp"
    df.columns = ["_".join(col).strip() if isinstance(col, tuple) else col for col in df.columns]

    # Debug output: show the column names actually produced
    print("\nDEBUG: Column names are:")
    print(df.columns)
    print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n")

    # Give the columns that had no top-level header a short name
    df = df.rename(columns={
        "Unnamed: 0_level_0_Squad": "Squad",
        "Unnamed: 1_level_0_# Pl": "Players",
        "Unnamed: 2_level_0_90s": "90s",
        "Unnamed: 17_level_0_Ast": "Ast",
        "Unnamed: 18_level_0_xAG": "xAG",
        "Unnamed: 21_level_0_KP": "KP",
        "Unnamed: 22_level_0_1/3": "1/3",
        "Unnamed: 23_level_0_PPA": "PPA",
        "Unnamed: 24_level_0_CrsPA": "CrsPA",
        "Unnamed: 25_level_0_PrgP": "PrgP",
    })
    return df
|
| 52 |
+
|
| 53 |
+
def filter_teams(df, teams):
    """Return the rows of ``df`` whose ``"Squad"`` value is one of ``teams``.

    Args:
        df: DataFrame that has a ``"Squad"`` column.
        teams: Squad names to keep. A single name may be passed as a plain
            string; it is treated as a one-element list.

    Returns:
        A DataFrame holding only the matching rows, in their original order.

    Raises:
        KeyError: If ``df`` has no ``"Squad"`` column.
    """
    # Series.isin() rejects a bare string, so wrap a lone team name.
    if isinstance(teams, str):
        teams = [teams]
    return df[df["Squad"].isin(teams)]
|
| 55 |
+
|
| 56 |
+
def main():
    """Fetch the team passing table and print selected columns for two clubs."""
    passing = pull_premier_league_team_passing()
    wanted_squads = ["Arsenal", "Nott'ham Forest"]
    selected = filter_teams(passing, wanted_squads)

    shown_columns = ["Squad", "Total_Cmp", "Total_Att", "Total_Cmp%", "Total_TotDist"]
    print("\nπ Passing Stats for Arsenal & Nottingham Forest (Team Level)")
    print("=" * 70)
    print(selected[shown_columns])
|
| 63 |
+
|
| 64 |
+
if __name__ == "__main__":
|
| 65 |
+
main()
|
.history/fbrefdata_example_20251004185920.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from io import StringIO
|
| 4 |
+
from selenium import webdriver
|
| 5 |
+
from selenium.webdriver.chrome.service import Service as ChromeService
|
| 6 |
+
from selenium.webdriver.chrome.options import Options as ChromeOptions
|
| 7 |
+
from webdriver_manager.chrome import ChromeDriverManager
|
| 8 |
+
|
| 9 |
+
def pull_premier_league_team_passing():
    """Load the FBref Premier League passing page in Chrome and parse its first table.

    The table is read with its first two rows as a two-level header. The
    flattened column names are printed as a debugging aid.

    Returns:
        DataFrame parsed from the first HTML table of the rendered page.
        Tuple (multi-level) column labels are joined with ``"_"`` and the
        ``"Unnamed: N_level_0_*"`` labels listed below are shortened.

    Raises:
        ValueError: If pandas finds no table in the page source.
    """
    url = "https://fbref.com/en/comps/9/passing/Premier-League-Stats"
    print(f"Opening browser to download team passing stats from {url} ...")

    options = ChromeOptions()
    options.add_argument("--start-maximized")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)

    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)
    try:
        driver.get(url)
        # Fixed wait so the page can finish loading before it is read.
        time.sleep(5)
        html_source = driver.page_source
    finally:
        # The browser is no longer needed once the HTML is captured; closing
        # it here also covers the case where loading or parsing fails.
        driver.quit()

    print("Data downloaded. Processing with pandas...")

    # Specify the header rows
    df = pd.read_html(StringIO(html_source), header=[0, 1])[0]

    # Flatten the multi-level header: ("Total", "Cmp") -> "Total_Cmp"
    df.columns = ["_".join(col).strip() if isinstance(col, tuple) else col for col in df.columns]

    # Debug output: show the column names actually produced
    print("\nDEBUG: Column names are:")
    print(df.columns)
    print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n")

    # Give the columns that had no top-level header a short name
    df = df.rename(columns={
        "Unnamed: 0_level_0_Squad": "Squad",
        "Unnamed: 1_level_0_# Pl": "Players",
        "Unnamed: 2_level_0_90s": "90s",
        "Unnamed: 17_level_0_Ast": "Ast",
        "Unnamed: 18_level_0_xAG": "xAG",
        "Unnamed: 21_level_0_KP": "KP",
        "Unnamed: 22_level_0_1/3": "1/3",
        "Unnamed: 23_level_0_PPA": "PPA",
        "Unnamed: 24_level_0_CrsPA": "CrsPA",
        "Unnamed: 25_level_0_PrgP": "PrgP",
    })
    return df
|
| 55 |
+
|
| 56 |
+
def filter_teams(df, teams):
    """Return the rows of ``df`` whose ``"Squad"`` value is one of ``teams``.

    Args:
        df: DataFrame that has a ``"Squad"`` column.
        teams: Squad names to keep. A single name may be passed as a plain
            string; it is treated as a one-element list.

    Returns:
        A DataFrame holding only the matching rows, in their original order.

    Raises:
        KeyError: If ``df`` has no ``"Squad"`` column.
    """
    # Series.isin() rejects a bare string, so wrap a lone team name.
    if isinstance(teams, str):
        teams = [teams]
    return df[df["Squad"].isin(teams)]
|
| 58 |
+
|
| 59 |
+
def main():
    """Fetch the team passing table and print selected columns for two clubs."""
    passing = pull_premier_league_team_passing()
    wanted_squads = ["Arsenal", "Nott'ham Forest"]
    selected = filter_teams(passing, wanted_squads)

    shown_columns = ["Squad", "Total_Cmp", "Total_Att", "Total_Cmp%", "Total_TotDist"]
    print("\nπ Passing Stats for Arsenal & Nottingham Forest (Team Level)")
    print("=" * 70)
    print(selected[shown_columns])
|
| 66 |
+
|
| 67 |
+
if __name__ == "__main__":
|
| 68 |
+
main()
|
.history/fbrefdata_example_20251004190022.py
ADDED
|
File without changes
|
.history/fbrefdata_example_20251004190027.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from io import StringIO
|
| 4 |
+
from selenium import webdriver
|
| 5 |
+
from selenium.webdriver.chrome.service import Service as ChromeService
|
| 6 |
+
from selenium.webdriver.chrome.options import Options as ChromeOptions
|
| 7 |
+
from webdriver_manager.chrome import ChromeDriverManager
|
| 8 |
+
|
| 9 |
+
def pull_premier_league_team_passing():
    """Load the FBref Premier League passing page in headless Chrome and
    return five columns of its first table.

    Returns:
        DataFrame with the columns ``Squad``, ``Total_Cmp``, ``Total_Att``,
        ``Total_Cmp%`` and ``Total_TotDist``, taken by position from the
        first HTML table, with the table's last row dropped.

    Raises:
        ValueError: If pandas finds no table in the page source.
        IndexError: If the table has fewer than nine columns.
    """
    url = "https://fbref.com/en/comps/9/passing/Premier-League-Stats"
    print(f"Opening browser to download team passing stats from {url} ...")

    options = ChromeOptions()
    options.add_argument("--start-maximized")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--headless")  # run the browser in the background so no window appears
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)

    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)
    try:
        driver.get(url)
        # Fixed wait so the page can finish loading before it is read.
        time.sleep(3)
        html_source = driver.page_source
    finally:
        # Always close the browser, even if loading the page failed,
        # so no Chrome process is left running.
        driver.quit()

    print("Data downloaded. Processing with pandas...")

    # Take the first table in the HTML
    df = pd.read_html(StringIO(html_source))[0]

    # 1. Keep only the needed columns, selected by POSITION. Plain
    #    df[[1, 5, ...]] looks the integers up as column labels and raises
    #    KeyError when the table has string or multi-level headers.
    # NOTE(review): positions (1: Squad, 5: Total Cmp, 6: Total Att,
    # 7: Total Cmp%, 8: Total TotDist) come from the original author's note;
    # confirm them against the live table.
    df = df.iloc[:, [1, 5, 6, 7, 8]].copy()

    # 2. Give those columns their new names
    df.columns = ['Squad', 'Total_Cmp', 'Total_Att', 'Total_Cmp%', 'Total_TotDist']

    # 3. Drop the last row (per the original note it usually holds the
    #    league total/average)
    df = df.iloc[:-1]

    return df
|
| 52 |
+
|
| 53 |
+
def filter_teams(df, teams):
    """Return only the rows of *df* whose ``Squad`` value is listed in *teams*.

    The caller is expected to have already named the squad column ``Squad``.

    Args:
        df: DataFrame containing a ``Squad`` column.
        teams: List-like of squad names, or a single squad name as a string.

    Returns:
        The rows of ``df`` whose ``Squad`` is one of ``teams``.
    """
    # Series.isin rejects a bare string, so wrap a single name in a list.
    if isinstance(teams, str):
        teams = [teams]
    return df[df["Squad"].isin(teams)]
|
| 56 |
+
|
| 57 |
+
def main():
    """Download the team passing table and print the rows for two squads."""
    df = pull_premier_league_team_passing()

    teams = ["Arsenal", "Nott'ham Forest"]
    df_filtered = filter_teams(df, teams)

    print("\n📊 Passing Stats for Arsenal & Nottingham Forest (Team Level)")
    print("=" * 70)
    # df_filtered already holds only the wanted columns, so print it as-is.
    print(df_filtered)


if __name__ == "__main__":
    main()
|
.history/fbrefdata_example_20251004190339.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from io import StringIO
|
| 4 |
+
from selenium import webdriver
|
| 5 |
+
from selenium.webdriver.chrome.service import Service as ChromeService
|
| 6 |
+
from selenium.webdriver.chrome.options import Options as ChromeOptions
|
| 7 |
+
from webdriver_manager.chrome import ChromeDriverManager
|
| 8 |
+
# Imports baru untuk menunggu dengan cerdas
|
| 9 |
+
from selenium.webdriver.common.by import By
|
| 10 |
+
from selenium.webdriver.support.ui import WebDriverWait
|
| 11 |
+
from selenium.webdriver.support import expected_conditions as EC
|
| 12 |
+
from selenium.common.exceptions import TimeoutException
|
| 13 |
+
|
| 14 |
+
def pull_premier_league_team_passing():
    """Scrape the FBRef Premier League passing page and return five columns.

    Opens Chrome through Selenium, clicks the cookie banner if one appears,
    waits for the ``div_stats_passing`` element to become visible, then
    parses the first HTML table of the page with pandas.

    Returns:
        A DataFrame with columns ``Squad``, ``Total_Cmp``, ``Total_Att``,
        ``Total_Cmp%`` and ``Total_TotDist`` (last table row dropped), or
        ``None`` when the stats element did not become visible in time.
    """
    url = "https://fbref.com/en/comps/9/passing/Premier-League-Stats"
    print(f"Opening browser to download team passing stats from {url} ...")

    options = ChromeOptions()
    options.add_argument("--start-maximized")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    # Headless mode is deliberately off here so the browser window can be
    # watched while debugging.
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)

    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)
    # try/finally guarantees the browser is closed even when navigation or a
    # wait raises something other than TimeoutException.
    try:
        driver.get(url)

        try:
            # Wait at most 10 seconds for the cookie button, then click it.
            wait = WebDriverWait(driver, 10)
            accept_button = wait.until(EC.element_to_be_clickable(
                (By.XPATH, "//button[contains(text(), 'Accept All Cookies')]")))
            accept_button.click()
            print("Cookie banner accepted.")
        except TimeoutException:
            # No button within 10 seconds: assume there is no banner.
            print("No cookie banner found or it took too long.")

        try:
            # Wait at most 10 seconds for the div wrapping the table.
            wait = WebDriverWait(driver, 10)
            wait.until(EC.visibility_of_element_located((By.ID, "div_stats_passing")))
            print("Stats table is now visible.")
        except TimeoutException:
            print("The stats table could not be found on the page.")
            return None

        html_source = driver.page_source
    finally:
        driver.quit()

    print("Data downloaded. Processing with pandas...")

    df = pd.read_html(StringIO(html_source))[0]

    # Select by POSITION: a table parsed with a header row has string (or
    # MultiIndex) labels, so label-based df[[1, 5, ...]] raises KeyError.
    df = df.iloc[:, [1, 5, 6, 7, 8]]
    df.columns = ['Squad', 'Total_Cmp', 'Total_Att', 'Total_Cmp%', 'Total_TotDist']
    # Drop the last row of the table.
    df = df.iloc[:-1]

    return df
|
| 67 |
+
|
| 68 |
+
def filter_teams(df, teams):
    """Return only the rows of *df* whose ``Squad`` value is listed in *teams*.

    Args:
        df: DataFrame containing a ``Squad`` column.
        teams: List-like of squad names, or a single squad name as a string.

    Returns:
        The rows of ``df`` whose ``Squad`` is one of ``teams``.
    """
    # Series.isin rejects a bare string, so wrap a single name in a list.
    if isinstance(teams, str):
        teams = [teams]
    return df[df["Squad"].isin(teams)]
|
| 70 |
+
|
| 71 |
+
def main():
    """Download the team passing table and print the rows for two squads."""
    df = pull_premier_league_team_passing()
    # The scraper returns None when the stats table could not be located.
    if df is None:
        return

    teams = ["Arsenal", "Nott'ham Forest"]
    df_filtered = filter_teams(df, teams)
    print("\n📊 Passing Stats for Arsenal & Nottingham Forest (Team Level)")
    print("=" * 70)
    print(df_filtered)


if __name__ == "__main__":
    main()
|
.history/fbrefdata_example_20251004190507.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from io import StringIO
|
| 4 |
+
from selenium import webdriver
|
| 5 |
+
from selenium.webdriver.chrome.service import Service as ChromeService
|
| 6 |
+
from selenium.webdriver.chrome.options import Options as ChromeOptions
|
| 7 |
+
from webdriver_manager.chrome import ChromeDriverManager
|
| 8 |
+
from selenium.webdriver.common.by import By
|
| 9 |
+
from selenium.webdriver.support.ui import WebDriverWait
|
| 10 |
+
from selenium.webdriver.support import expected_conditions as EC
|
| 11 |
+
from selenium.common.exceptions import TimeoutException
|
| 12 |
+
|
| 13 |
+
def pull_premier_league_team_passing():
    """Scrape the FBRef Premier League passing page (table-shape debug variant).

    Runs headless Chrome through Selenium, clicks the cookie banner if one
    appears, waits for ``div_stats_passing`` to become visible, parses every
    HTML table on the page and prints each table's shape, then builds the
    result from the first table.

    Returns:
        A DataFrame with columns ``Squad``, ``Total_Cmp``, ``Total_Att``,
        ``Total_Cmp%`` and ``Total_TotDist`` (last table row dropped), or
        ``None`` when the stats element did not become visible in time.

    Raises:
        IndexError: If the first table has fewer than nine columns.
    """
    url = "https://fbref.com/en/comps/9/passing/Premier-League-Stats"
    print(f"Opening browser to download team passing stats from {url} ...")

    options = ChromeOptions()
    options.add_argument("--start-maximized")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--headless")  # headless switched back on for speed
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)

    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)
    # try/finally guarantees the browser is closed even when navigation or a
    # wait raises something other than TimeoutException.
    try:
        driver.get(url)

        try:
            wait = WebDriverWait(driver, 10)
            accept_button = wait.until(EC.element_to_be_clickable(
                (By.XPATH, "//button[contains(text(), 'Accept All Cookies')]")))
            accept_button.click()
            print("Cookie banner accepted.")
        except TimeoutException:
            print("No cookie banner found or it took too long.")

        try:
            wait = WebDriverWait(driver, 10)
            wait.until(EC.visibility_of_element_located((By.ID, "div_stats_passing")))
            print("Stats table is now visible.")
        except TimeoutException:
            print("The stats table could not be found on the page.")
            return None

        html_source = driver.page_source
    finally:
        driver.quit()

    print("Data downloaded. Processing with pandas...")

    # Investigation step: read EVERY table on the page, not just the first.
    all_tables = pd.read_html(StringIO(html_source))
    print(f"\nDEBUG: Found {len(all_tables)} tables on the page.")

    # Report the (rows, columns) shape of each table that was found.
    for i, table in enumerate(all_tables):
        print(f"DEBUG: Table [{i}] has shape: {table.shape}")

    # The first table is still used for now; the shapes printed above are
    # there to show which index actually holds the stats.
    df = all_tables[0]

    # Select by POSITION: a table parsed with a header row has string (or
    # MultiIndex) labels, so label-based df[[1, 5, ...]] raises KeyError.
    df = df.iloc[:, [1, 5, 6, 7, 8]]
    df.columns = ['Squad', 'Total_Cmp', 'Total_Att', 'Total_Cmp%', 'Total_TotDist']
    df = df.iloc[:-1]

    return df
|
| 71 |
+
|
| 72 |
+
def filter_teams(df, teams):
    """Return only the rows of *df* whose ``Squad`` value is listed in *teams*.

    Args:
        df: DataFrame containing a ``Squad`` column.
        teams: List-like of squad names, or a single squad name as a string.

    Returns:
        The rows of ``df`` whose ``Squad`` is one of ``teams``.
    """
    # Series.isin rejects a bare string, so wrap a single name in a list.
    if isinstance(teams, str):
        teams = [teams]
    return df[df["Squad"].isin(teams)]
|
| 74 |
+
|
| 75 |
+
def main():
    """Download the team passing table and print the rows for two squads."""
    df = pull_premier_league_team_passing()
    # The scraper returns None when the stats table could not be located.
    if df is None:
        return

    teams = ["Arsenal", "Nott'ham Forest"]
    df_filtered = filter_teams(df, teams)
    print("\n📊 Passing Stats for Arsenal & Nottingham Forest (Team Level)")
    print("=" * 70)
    print(df_filtered)


if __name__ == "__main__":
    main()
|
.history/fbrefdata_example_20251004190633.py
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from io import StringIO
|
| 4 |
+
from selenium import webdriver
|
| 5 |
+
from selenium.webdriver.chrome.service import Service as ChromeService
|
| 6 |
+
from selenium.webdriver.chrome.options import Options as ChromeOptions
|
| 7 |
+
from webdriver_manager.chrome import ChromeDriverManager
|
| 8 |
+
from selenium.webdriver.common.by import By
|
| 9 |
+
from selenium.webdriver.support.ui import WebDriverWait
|
| 10 |
+
from selenium.webdriver.support import expected_conditions as EC
|
| 11 |
+
from selenium.common.exceptions import TimeoutException
|
| 12 |
+
|
| 13 |
+
def pull_premier_league_team_passing():
    """Scrape the FBRef Premier League passing page and return five columns.

    Opens Chrome through Selenium, clicks the cookie banner if one appears
    and waits for ``div_stats_passing`` to become visible. On timeout a
    screenshot and the page HTML are written to ``debug_screenshot.png`` and
    ``debug_page.html``. Otherwise the table with the most columns is
    treated as the main stats table.

    Returns:
        A DataFrame with columns ``Squad``, ``Total_Cmp``, ``Total_Att``,
        ``Total_Cmp%`` and ``Total_TotDist`` (last table row dropped), or
        ``None`` when the stats element did not become visible in time.

    Raises:
        IndexError: If the selected table has fewer than nine columns.
    """
    url = "https://fbref.com/en/comps/9/passing/Premier-League-Stats"
    print(f"Opening browser to download team passing stats from {url} ...")

    options = ChromeOptions()
    options.add_argument("--start-maximized")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    # Headless mode is deliberately off so the process can be watched.
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)

    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)
    # try/finally guarantees the browser is closed on every exit path.
    try:
        driver.get(url)

        try:
            wait = WebDriverWait(driver, 10)
            accept_button = wait.until(EC.element_to_be_clickable(
                (By.XPATH, "//button[contains(text(), 'Accept All Cookies')]")))
            accept_button.click()
            print("Cookie banner accepted.")
        except TimeoutException:
            print("No cookie banner found or it took too long.")

        try:
            wait = WebDriverWait(driver, 10)
            wait.until(EC.visibility_of_element_located((By.ID, "div_stats_passing")))
            print("Stats table is now visible.")
            html_source = driver.page_source
        except TimeoutException:
            print("The stats table could not be found on the page. Saving debug files...")
            # Keep evidence of the failure: what the browser showed...
            driver.save_screenshot('debug_screenshot.png')
            # ...and the HTML it was displaying at that moment.
            with open('debug_page.html', 'w', encoding='utf-8') as f:
                f.write(driver.page_source)
            return None
    finally:
        driver.quit()

    print("Data downloaded. Processing with pandas...")

    all_tables = pd.read_html(StringIO(html_source))

    # Assume the main stats table is the one with the most columns.
    main_df = max(all_tables, key=lambda table: len(table.columns))
    print(f"Main table selected with shape: {main_df.shape}")

    # Select by POSITION: a table parsed with a header row has string (or
    # MultiIndex) labels, so label-based main_df[[1, 5, ...]] raises KeyError.
    df = main_df.iloc[:, [1, 5, 6, 7, 8]]
    df.columns = ['Squad', 'Total_Cmp', 'Total_Att', 'Total_Cmp%', 'Total_TotDist']
    df = df.iloc[:-1]

    return df
|
| 76 |
+
|
| 77 |
+
def filter_teams(df, teams):
    """Return only the rows of *df* whose ``Squad`` value is listed in *teams*.

    Args:
        df: DataFrame containing a ``Squad`` column.
        teams: List-like of squad names, or a single squad name as a string.

    Returns:
        The rows of ``df`` whose ``Squad`` is one of ``teams``.
    """
    # Series.isin rejects a bare string, so wrap a single name in a list.
    if isinstance(teams, str):
        teams = [teams]
    return df[df["Squad"].isin(teams)]
|
| 79 |
+
|
| 80 |
+
def main():
    """Download the team passing table and print the rows for two squads."""
    df = pull_premier_league_team_passing()
    # The scraper returns None when the stats table could not be located.
    if df is None:
        return

    teams = ["Arsenal", "Nott'ham Forest"]
    df_filtered = filter_teams(df, teams)
    print("\n📊 Passing Stats for Arsenal & Nottingham Forest (Team Level)")
    print("=" * 70)
    print(df_filtered)


if __name__ == "__main__":
    main()
|
.history/fbrefdata_example_20251004190944.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from io import StringIO
|
| 4 |
+
from selenium import webdriver
|
| 5 |
+
from selenium.webdriver.chrome.service import Service as ChromeService
|
| 6 |
+
from selenium.webdriver.chrome.options import Options as ChromeOptions
|
| 7 |
+
from webdriver_manager.chrome import ChromeDriverManager
|
| 8 |
+
from selenium.webdriver.common.by import By
|
| 9 |
+
from selenium.webdriver.support.ui import WebDriverWait
|
| 10 |
+
from selenium.webdriver.support import expected_conditions as EC
|
| 11 |
+
from selenium.common.exceptions import TimeoutException
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def pull_premier_league_team_passing():
    """Scrape the FBRef Premier League passing page and return five columns.

    Opens Chrome through Selenium, clicks the cookie banner if one appears
    and waits for ``div_stats_passing`` to become visible. On timeout a
    screenshot and the page HTML are written to ``debug_screenshot.png`` and
    ``debug_page.html``. Otherwise the table with the most columns is
    selected, a two-level header is flattened to ``Group_Stat`` names, and
    the first column for each wanted stat is kept.

    Returns:
        A DataFrame with columns ``Squad``, ``Total_Cmp``, ``Total_Att``,
        ``Total_Cmp%`` and ``Total_TotDist`` with repeated header rows
        removed, or ``None`` when the stats element did not become visible.

    Raises:
        KeyError: If the selected table lacks one of the wanted columns.
    """
    url = "https://fbref.com/en/comps/9/passing/Premier-League-Stats"
    print(f"Opening browser to download team passing stats from {url} ...")

    options = ChromeOptions()
    options.add_argument("--start-maximized")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    # Headless mode is currently disabled.
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)

    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)
    # try/finally guarantees the browser is closed on every exit path.
    try:
        driver.get(url)

        try:
            wait = WebDriverWait(driver, 10)
            accept_button = wait.until(EC.element_to_be_clickable(
                (By.XPATH, "//button[contains(text(), 'Accept All Cookies')]")))
            accept_button.click()
            print("Cookie banner accepted.")
        except TimeoutException:
            print("No cookie banner found or it took too long.")

        try:
            wait = WebDriverWait(driver, 10)
            wait.until(EC.visibility_of_element_located((By.ID, "div_stats_passing")))
            print("Stats table is now visible.")
            html_source = driver.page_source
        except TimeoutException:
            print("The stats table could not be found on the page. Saving debug files...")
            driver.save_screenshot('debug_screenshot.png')
            with open('debug_page.html', 'w', encoding='utf-8') as f:
                f.write(driver.page_source)
            return None
    finally:
        driver.quit()

    all_tables = pd.read_html(StringIO(html_source))
    print("Data downloaded. Processing with pandas...")

    # Take the main table: the one with the most columns.
    main_df = max(all_tables, key=lambda table: len(table.columns))
    print(f"Main table selected with shape: {main_df.shape}")

    # Flatten a multi-level header by joining its levels with "_".
    if isinstance(main_df.columns, pd.MultiIndex):
        main_df.columns = ['_'.join(col).strip() for col in main_df.columns.values]

    # Show a few column names so the real labels are visible in the log.
    print("Available columns:", main_df.columns[:10].tolist())

    # Keep exactly ONE column per wanted stat (the first whose name is the
    # stat itself or ends in "_<stat>"). Plain substring matching would also
    # pick up the other Cmp/Att groups and break the five-name relabelling.
    wanted = (
        ('Squad', 'Squad'),
        ('Cmp', 'Total_Cmp'),
        ('Att', 'Total_Att'),
        ('Cmp%', 'Total_Cmp%'),
        ('TotDist', 'Total_TotDist'),
    )
    picked = {}
    for stat, new_name in wanted:
        match = next(
            (c for c in main_df.columns if str(c) == stat or str(c).endswith('_' + stat)),
            None,
        )
        if match is None:
            raise KeyError(f"no column for {stat!r} in the selected table")
        picked[match] = new_name

    df = main_df[list(picked)].rename(columns=picked)
    # Drop rows without a squad and header rows repeated inside the body.
    df = df[df['Squad'].notna() & (df['Squad'] != 'Squad')]

    return df
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def filter_teams(df, teams):
    """Return only the rows of *df* whose ``Squad`` value is listed in *teams*.

    Args:
        df: DataFrame containing a ``Squad`` column.
        teams: List-like of squad names, or a single squad name as a string.

    Returns:
        The rows of ``df`` whose ``Squad`` is one of ``teams``.
    """
    # Series.isin rejects a bare string, so wrap a single name in a list.
    if isinstance(teams, str):
        teams = [teams]
    return df[df["Squad"].isin(teams)]
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def main():
    """Download the team passing table and print the rows for two squads."""
    df = pull_premier_league_team_passing()
    # The scraper returns None when the stats table could not be located.
    if df is None:
        return

    teams = ["Arsenal", "Nott'ham Forest"]
    df_filtered = filter_teams(df, teams)
    print("\n📊 Passing Stats for Arsenal & Nottingham Forest (Team Level)")
    print("=" * 70)
    print(df_filtered)


if __name__ == "__main__":
    main()
|
.history/fbrefdata_example_20251004191947.py
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from io import StringIO
|
| 4 |
+
from selenium import webdriver
|
| 5 |
+
from selenium.webdriver.chrome.service import Service as ChromeService
|
| 6 |
+
from selenium.webdriver.chrome.options import Options as ChromeOptions
|
| 7 |
+
from webdriver_manager.chrome import ChromeDriverManager
|
| 8 |
+
from selenium.webdriver.common.by import By
|
| 9 |
+
from selenium.webdriver.support.ui import WebDriverWait
|
| 10 |
+
from selenium.webdriver.support import expected_conditions as EC
|
| 11 |
+
from selenium.common.exceptions import TimeoutException
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def pull_premier_league_team_passing():
    """Scrape the FBRef Premier League passing page and return five columns.

    Opens Chrome through Selenium, clicks the cookie banner if one appears
    and waits up to 15 seconds for the ``stats_passing_team`` element to
    become visible. On timeout a screenshot and the page HTML are written to
    ``debug_screenshot.png`` and ``debug_page.html``. Otherwise the first
    parsed table that has a ``Squad`` column is used.

    Returns:
        A DataFrame with columns ``Squad``, ``Total_Cmp``, ``Total_Att``,
        ``Total_Cmp%`` and ``Total_TotDist`` with empty and repeated-header
        rows removed, or ``None`` when the element did not appear or no
        table with a ``Squad`` column was found.

    Raises:
        KeyError: If the team table lacks one of the wanted stat columns.
    """
    url = "https://fbref.com/en/comps/9/passing/Premier-League-Stats"
    print(f"Opening browser to download team passing stats from {url} ...")

    options = ChromeOptions()
    options.add_argument("--start-maximized")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    # Headless mode is currently disabled.
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)

    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)
    # try/finally guarantees the browser is closed on every exit path.
    try:
        driver.get(url)

        try:
            wait = WebDriverWait(driver, 10)
            accept_button = wait.until(EC.element_to_be_clickable(
                (By.XPATH, "//button[contains(text(), 'Accept All Cookies')]")))
            accept_button.click()
            print("Cookie banner accepted.")
        except TimeoutException:
            print("No cookie banner found or it took too long.")

        try:
            wait = WebDriverWait(driver, 15)
            wait.until(EC.visibility_of_element_located((By.ID, "stats_passing_team")))
            print("Team stats table is visible.")
            html_source = driver.page_source
        except TimeoutException:
            print("❌ The team stats table could not be found on the page. Saving debug files...")
            driver.save_screenshot('debug_screenshot.png')
            with open('debug_page.html', 'w', encoding='utf-8') as f:
                f.write(driver.page_source)
            return None
    finally:
        driver.quit()

    print("Data downloaded. Processing with pandas...")

    # Keep only the team passing table.
    all_tables = pd.read_html(StringIO(html_source))
    team_df = None
    for table in all_tables:
        # Flatten a two-level header BEFORE looking for "Squad": membership
        # on a MultiIndex only tests the top level, where "Squad" may not be.
        if isinstance(table.columns, pd.MultiIndex):
            table.columns = ['_'.join(col).strip() for col in table.columns.values]
        if any('Squad' in str(c) for c in table.columns):
            team_df = table
            break

    if team_df is None:
        print("❌ No team table found.")
        return None

    print(f"✅ Found team table with shape: {team_df.shape}")

    # Keep exactly ONE column per wanted stat (the first whose name is the
    # stat itself or ends in "_<stat>"). Substring matching would rename
    # every Cmp/Att group to the same "Total_*" label and create duplicates.
    wanted = (
        ('Squad', 'Squad'),
        ('Cmp', 'Total_Cmp'),
        ('Att', 'Total_Att'),
        ('Cmp%', 'Total_Cmp%'),
        ('TotDist', 'Total_TotDist'),
    )
    picked = {}
    for stat, new_name in wanted:
        match = next(
            (c for c in team_df.columns if str(c) == stat or str(c).endswith('_' + stat)),
            None,
        )
        if match is None:
            raise KeyError(f"no column for {stat!r} in the team table")
        picked[match] = new_name

    # Non-inplace rename: the selection is a new frame, so nothing is
    # modified through a slice of the parsed table.
    team_df = team_df[list(picked)].rename(columns=picked)

    # Drop rows without a squad and header rows repeated inside the body.
    team_df = team_df[team_df['Squad'].notna()]
    team_df = team_df[~team_df['Squad'].str.contains("Squad|Rk", na=False)]

    return team_df
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
def filter_teams(df, teams):
    """Return only the rows of *df* whose ``Squad`` value is listed in *teams*.

    Args:
        df: DataFrame containing a ``Squad`` column.
        teams: List-like of squad names, or a single squad name as a string.

    Returns:
        The rows of ``df`` whose ``Squad`` is one of ``teams``.
    """
    # Series.isin rejects a bare string, so wrap a single name in a list.
    if isinstance(teams, str):
        teams = [teams]
    return df[df["Squad"].isin(teams)]
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def main():
    """Download the team passing table and print the rows for two squads."""
    df = pull_premier_league_team_passing()
    # The scraper returns None when the team table could not be located.
    if df is None:
        return

    teams = ["Arsenal", "Nott'ham Forest"]
    df_filtered = filter_teams(df, teams)
    print("\n📊 Passing Stats for Arsenal & Nottingham Forest (Team Level)")
    print("=" * 70)
    print(df_filtered)


if __name__ == "__main__":
    main()
|
.history/fbrefdata_example_20251005091604.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from io import StringIO
|
| 4 |
+
from selenium import webdriver
|
| 5 |
+
from selenium.webdriver.chrome.service import Service as ChromeService
|
| 6 |
+
from selenium.webdriver.chrome.options import Options as ChromeOptions
|
| 7 |
+
from webdriver_manager.chrome import ChromeDriverManager
|
| 8 |
+
from selenium.webdriver.common.by import By
|
| 9 |
+
from selenium.webdriver.support.ui import WebDriverWait
|
| 10 |
+
from selenium.webdriver.support import expected_conditions as EC
|
| 11 |
+
from selenium.common.exceptions import TimeoutException
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def pull_premier_league_team_passing():
    """Scrape the FBRef Premier League passing page and return five columns.

    Opens Chrome through Selenium, clicks the cookie banner if one appears
    and waits up to 20 seconds for the ``div_stats_passing_team`` element to
    be present. Only that element's outer HTML is handed to pandas. On
    timeout a screenshot and the page HTML are written to
    ``debug_screenshot.png`` and ``debug_page.html``.

    Returns:
        A DataFrame with columns ``Squad``, ``Total_Cmp``, ``Total_Att``,
        ``Total_Cmp%`` and ``Total_TotDist`` with empty and repeated-header
        rows removed, or ``None`` when the element did not appear in time.

    Raises:
        KeyError: If the team table lacks one of the wanted columns.
    """
    url = "https://fbref.com/en/comps/9/passing/Premier-League-Stats"
    print(f"Opening browser to download team passing stats from {url} ...")

    options = ChromeOptions()
    options.add_argument("--start-maximized")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    # Headless mode is off; it can be enabled when the browser window does
    # not need to be watched.
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)

    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)
    # try/finally guarantees the browser is closed on every exit path.
    try:
        driver.get(url)

        try:
            wait = WebDriverWait(driver, 10)
            accept_button = wait.until(EC.element_to_be_clickable(
                (By.XPATH, "//button[contains(text(), 'Accept All Cookies')]")))
            accept_button.click()
            print("Cookie banner accepted.")
        except TimeoutException:
            print("No cookie banner found or it took too long.")

        # Wait for the wrapper div of the team table to appear.
        try:
            wait = WebDriverWait(driver, 20)
            div_element = wait.until(EC.presence_of_element_located((By.ID, "div_stats_passing_team")))
            print("✅ Team stats div found, extracting HTML...")

            # Take only the HTML of the team passing table.
            team_html = div_element.get_attribute("outerHTML")
        except TimeoutException:
            print("❌ The team stats table could not be found on the page. Saving debug files...")
            driver.save_screenshot('debug_screenshot.png')
            with open('debug_page.html', 'w', encoding='utf-8') as f:
                f.write(driver.page_source)
            return None
    finally:
        driver.quit()

    print("Data downloaded. Processing with pandas...")

    # Parse the table out of the extracted HTML fragment.
    team_df = pd.read_html(StringIO(team_html))[0]
    print(f"✅ Found team table with shape: {team_df.shape}")

    # Flatten a two-row header by joining its levels with "_".
    if isinstance(team_df.columns, pd.MultiIndex):
        team_df.columns = ['_'.join(col).strip() for col in team_df.columns.values]

    # Keep exactly ONE column per wanted stat (the first whose name is the
    # stat itself or ends in "_<stat>"). Substring matching would rename
    # every Cmp/Att group to the same "Total_*" label and create duplicates.
    wanted = (
        ('Squad', 'Squad'),
        ('Cmp', 'Total_Cmp'),
        ('Att', 'Total_Att'),
        ('Cmp%', 'Total_Cmp%'),
        ('TotDist', 'Total_TotDist'),
    )
    picked = {}
    for stat, new_name in wanted:
        match = next(
            (c for c in team_df.columns if str(c) == stat or str(c).endswith('_' + stat)),
            None,
        )
        if match is None:
            raise KeyError(f"no column for {stat!r} in the team table")
        picked[match] = new_name

    # Non-inplace rename: the selection is a new frame, so nothing is
    # modified through a slice of the parsed table.
    team_df = team_df[list(picked)].rename(columns=picked)

    # Drop rows without a squad and header rows repeated inside the body.
    team_df = team_df[team_df['Squad'].notna()]
    team_df = team_df[~team_df['Squad'].str.contains("Squad|Rk", na=False)]

    return team_df
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def filter_teams(df, teams):
    """Return only the rows of *df* whose ``Squad`` value is listed in *teams*.

    Args:
        df: DataFrame containing a ``Squad`` column.
        teams: List-like of squad names, or a single squad name as a string.

    Returns:
        The rows of ``df`` whose ``Squad`` is one of ``teams``.
    """
    # Series.isin rejects a bare string, so wrap a single name in a list.
    if isinstance(teams, str):
        teams = [teams]
    return df[df["Squad"].isin(teams)]
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
def main():
    """Download the team passing table, save it as CSV and print two squads."""
    df = pull_premier_league_team_passing()
    # The scraper returns None when the team table could not be located.
    if df is None:
        return

    # Always write the full table to CSV before filtering.
    df.to_csv("premier_league_team_passing.csv", index=False)
    print("\n💾 Saved to premier_league_team_passing.csv")

    teams = ["Arsenal", "Nott'ham Forest"]
    df_filtered = filter_teams(df, teams)
    print("\n📊 Passing Stats for Arsenal & Nottingham Forest (Team Level)")
    print("=" * 70)
    print(df_filtered)


if __name__ == "__main__":
    main()
|
.history/fbrefdata_example_20251005091825.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from io import StringIO
|
| 4 |
+
from selenium import webdriver
|
| 5 |
+
from selenium.webdriver.chrome.service import Service as ChromeService
|
| 6 |
+
from selenium.webdriver.chrome.options import Options as ChromeOptions
|
| 7 |
+
from webdriver_manager.chrome import ChromeDriverManager
|
| 8 |
+
from selenium.webdriver.common.by import By
|
| 9 |
+
from selenium.webdriver.support.ui import WebDriverWait
|
| 10 |
+
from selenium.webdriver.support import expected_conditions as EC
|
| 11 |
+
from selenium.common.exceptions import TimeoutException
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def pull_premier_league_team_passing():
    """Scrape the Premier League team passing table from FBref.

    Opens Chrome through Selenium, clicks the cookie banner when one becomes
    clickable within 10 seconds, waits up to 20 seconds for the element with
    id ``div_stats_passing_team`` and parses the table inside it with pandas.

    Returns:
        A DataFrame with the columns ``Squad``, ``Total_Cmp``, ``Total_Att``,
        ``Total_Cmp%`` and ``Total_TotDist`` (those that the table provides),
        or ``None`` when the wrapper div is not found in time. In that case
        ``debug_screenshot.png`` and ``debug_page.html`` are written to the
        current working directory.

    Raises:
        KeyError: If the parsed table has no ``Squad`` column.
    """
    url = "https://fbref.com/en/comps/9/passing/Premier-League-Stats"
    print(f"Opening browser to download team passing stats from {url} ...")

    options = ChromeOptions()
    options.add_argument("--start-maximized")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    # Add "--headless" here when the browser window does not need to be seen.
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)

    driver = webdriver.Chrome(
        service=ChromeService(ChromeDriverManager().install()), options=options
    )
    # try/finally closes the browser on every exit path, including unexpected
    # WebDriver errors; before, only the handled paths called quit().
    try:
        driver.get(url)

        try:
            accept_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable(
                    (By.XPATH, "//button[contains(text(), 'Accept All Cookies')]")
                )
            )
            accept_button.click()
            print("Cookie banner accepted.")
        except TimeoutException:
            print("No cookie banner found or it took too long.")

        # Wait for the wrapper div of the team table to appear.
        try:
            div_element = WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.ID, "div_stats_passing_team"))
            )
            print("✅ Team stats div found, extracting HTML...")

            # Keep only the HTML of the team passing table.
            team_html = div_element.get_attribute("outerHTML")
        except TimeoutException:
            print("❌ The team stats table could not be found on the page. Saving debug files...")
            driver.save_screenshot("debug_screenshot.png")
            with open("debug_page.html", "w", encoding="utf-8") as f:
                f.write(driver.page_source)
            return None
    finally:
        driver.quit()

    print("Data downloaded. Processing with pandas...")

    # Parse the table from the HTML fragment.
    team_df = pd.read_html(StringIO(team_html))[0]
    print(f"✅ Found team table with shape: {team_df.shape}")

    # With a two-row header, match on the bottom-level stat name only.
    # Substring matching on the flattened names also caught every other
    # column group and renamed them all to the same Total_* labels.
    if isinstance(team_df.columns, pd.MultiIndex):
        leaves = [str(col[-1]).strip() for col in team_df.columns]
    else:
        leaves = [str(col).strip() for col in team_df.columns]
    wanted = {
        "Squad": "Squad",
        "Cmp": "Total_Cmp",
        "Att": "Total_Att",
        "Cmp%": "Total_Cmp%",
        "TotDist": "Total_TotDist",
    }
    # Keep the first occurrence of each stat, in table order.
    # NOTE(review): presumably the first Cmp/Att/Cmp% belong to the "Total"
    # group -- confirm against the live table.
    picked = sorted(
        (leaves.index(leaf), name) for leaf, name in wanted.items() if leaf in leaves
    )
    team_df = team_df.iloc[:, [pos for pos, _ in picked]].copy()
    team_df.columns = [name for _, name in picked]

    # Drop rows without a squad and rows whose Squad cell contains the
    # header labels "Squad" or "Rk".
    team_df = team_df[team_df["Squad"].notna()]
    team_df = team_df[~team_df["Squad"].str.contains("Squad|Rk", na=False)]

    return team_df
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def filter_teams(df, teams):
    """Return the rows of ``df`` whose ``Squad`` value is one of ``teams``."""
    is_selected = df["Squad"].isin(teams)
    return df.loc[is_selected]
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
def main():
    """Download the team passing table, save it as CSV and print two squads.

    Does nothing when the scraper returns ``None``.
    """
    df = pull_premier_league_team_passing()
    if df is None:
        # Nothing to save or print when the scrape returned no table.
        return

    # Save the full table to CSV.
    df.to_csv("premier_league_team_passing.csv", index=False)
    # Emoji restored from mis-decoded UTF-8 bytes in the original literal.
    print("\n💾 Saved to premier_league_team_passing.csv")

    teams = ["Wolves", "Nott'ham Forest"]
    df_filtered = filter_teams(df, teams)
    # Build the heading from `teams`; the hard-coded text still named
    # Arsenal after the filter had been changed to Wolves.
    label = " & ".join(teams)
    print(f"\n📊 Passing Stats for {label} (Team Level)")
    print("=" * 70)
    print(df_filtered)


if __name__ == "__main__":
    main()
|
.history/fbrefdata_example_20251005091830.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from io import StringIO
|
| 4 |
+
from selenium import webdriver
|
| 5 |
+
from selenium.webdriver.chrome.service import Service as ChromeService
|
| 6 |
+
from selenium.webdriver.chrome.options import Options as ChromeOptions
|
| 7 |
+
from webdriver_manager.chrome import ChromeDriverManager
|
| 8 |
+
from selenium.webdriver.common.by import By
|
| 9 |
+
from selenium.webdriver.support.ui import WebDriverWait
|
| 10 |
+
from selenium.webdriver.support import expected_conditions as EC
|
| 11 |
+
from selenium.common.exceptions import TimeoutException
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def pull_premier_league_team_passing():
    """Scrape the Premier League team passing table from FBref.

    Opens Chrome through Selenium, clicks the cookie banner when one becomes
    clickable within 10 seconds, waits up to 20 seconds for the element with
    id ``div_stats_passing_team`` and parses the table inside it with pandas.

    Returns:
        A DataFrame with the columns ``Squad``, ``Total_Cmp``, ``Total_Att``,
        ``Total_Cmp%`` and ``Total_TotDist`` (those that the table provides),
        or ``None`` when the wrapper div is not found in time. In that case
        ``debug_screenshot.png`` and ``debug_page.html`` are written to the
        current working directory.

    Raises:
        KeyError: If the parsed table has no ``Squad`` column.
    """
    url = "https://fbref.com/en/comps/9/passing/Premier-League-Stats"
    print(f"Opening browser to download team passing stats from {url} ...")

    options = ChromeOptions()
    options.add_argument("--start-maximized")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    # Add "--headless" here when the browser window does not need to be seen.
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)

    driver = webdriver.Chrome(
        service=ChromeService(ChromeDriverManager().install()), options=options
    )
    # try/finally closes the browser on every exit path, including unexpected
    # WebDriver errors; before, only the handled paths called quit().
    try:
        driver.get(url)

        try:
            accept_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable(
                    (By.XPATH, "//button[contains(text(), 'Accept All Cookies')]")
                )
            )
            accept_button.click()
            print("Cookie banner accepted.")
        except TimeoutException:
            print("No cookie banner found or it took too long.")

        # Wait for the wrapper div of the team table to appear.
        try:
            div_element = WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.ID, "div_stats_passing_team"))
            )
            print("✅ Team stats div found, extracting HTML...")

            # Keep only the HTML of the team passing table.
            team_html = div_element.get_attribute("outerHTML")
        except TimeoutException:
            print("❌ The team stats table could not be found on the page. Saving debug files...")
            driver.save_screenshot("debug_screenshot.png")
            with open("debug_page.html", "w", encoding="utf-8") as f:
                f.write(driver.page_source)
            return None
    finally:
        driver.quit()

    print("Data downloaded. Processing with pandas...")

    # Parse the table from the HTML fragment.
    team_df = pd.read_html(StringIO(team_html))[0]
    print(f"✅ Found team table with shape: {team_df.shape}")

    # With a two-row header, match on the bottom-level stat name only.
    # Substring matching on the flattened names also caught every other
    # column group and renamed them all to the same Total_* labels.
    if isinstance(team_df.columns, pd.MultiIndex):
        leaves = [str(col[-1]).strip() for col in team_df.columns]
    else:
        leaves = [str(col).strip() for col in team_df.columns]
    wanted = {
        "Squad": "Squad",
        "Cmp": "Total_Cmp",
        "Att": "Total_Att",
        "Cmp%": "Total_Cmp%",
        "TotDist": "Total_TotDist",
    }
    # Keep the first occurrence of each stat, in table order.
    # NOTE(review): presumably the first Cmp/Att/Cmp% belong to the "Total"
    # group -- confirm against the live table.
    picked = sorted(
        (leaves.index(leaf), name) for leaf, name in wanted.items() if leaf in leaves
    )
    team_df = team_df.iloc[:, [pos for pos, _ in picked]].copy()
    team_df.columns = [name for _, name in picked]

    # Drop rows without a squad and rows whose Squad cell contains the
    # header labels "Squad" or "Rk".
    team_df = team_df[team_df["Squad"].notna()]
    team_df = team_df[~team_df["Squad"].str.contains("Squad|Rk", na=False)]

    return team_df
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def filter_teams(df, teams):
    """Return the rows of ``df`` whose ``Squad`` value is one of ``teams``."""
    is_selected = df["Squad"].isin(teams)
    return df.loc[is_selected]
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
def main():
    """Download the team passing table, save it as CSV and print two squads.

    Does nothing when the scraper returns ``None``.
    """
    df = pull_premier_league_team_passing()
    if df is None:
        # Nothing to save or print when the scrape returned no table.
        return

    # Save the full table to CSV.
    df.to_csv("premier_league_team_passing.csv", index=False)
    # Emoji restored from mis-decoded UTF-8 bytes in the original literal.
    print("\n💾 Saved to premier_league_team_passing.csv")

    # The second entry was the half-typed "B", which matches no squad.
    # NOTE(review): "Brighton" is taken from the later snapshots -- confirm.
    teams = ["Wolves", "Brighton"]
    df_filtered = filter_teams(df, teams)
    # Build the heading from `teams`; the hard-coded text still named
    # Arsenal and Nottingham Forest.
    label = " & ".join(teams)
    print(f"\n📊 Passing Stats for {label} (Team Level)")
    print("=" * 70)
    print(df_filtered)


if __name__ == "__main__":
    main()
|
.history/fbrefdata_example_20251005091835.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from io import StringIO
|
| 4 |
+
from selenium import webdriver
|
| 5 |
+
from selenium.webdriver.chrome.service import Service as ChromeService
|
| 6 |
+
from selenium.webdriver.chrome.options import Options as ChromeOptions
|
| 7 |
+
from webdriver_manager.chrome import ChromeDriverManager
|
| 8 |
+
from selenium.webdriver.common.by import By
|
| 9 |
+
from selenium.webdriver.support.ui import WebDriverWait
|
| 10 |
+
from selenium.webdriver.support import expected_conditions as EC
|
| 11 |
+
from selenium.common.exceptions import TimeoutException
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def pull_premier_league_team_passing():
    """Scrape the Premier League team passing table from FBref.

    Opens Chrome through Selenium, clicks the cookie banner when one becomes
    clickable within 10 seconds, waits up to 20 seconds for the element with
    id ``div_stats_passing_team`` and parses the table inside it with pandas.

    Returns:
        A DataFrame with the columns ``Squad``, ``Total_Cmp``, ``Total_Att``,
        ``Total_Cmp%`` and ``Total_TotDist`` (those that the table provides),
        or ``None`` when the wrapper div is not found in time. In that case
        ``debug_screenshot.png`` and ``debug_page.html`` are written to the
        current working directory.

    Raises:
        KeyError: If the parsed table has no ``Squad`` column.
    """
    url = "https://fbref.com/en/comps/9/passing/Premier-League-Stats"
    print(f"Opening browser to download team passing stats from {url} ...")

    options = ChromeOptions()
    options.add_argument("--start-maximized")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    # Add "--headless" here when the browser window does not need to be seen.
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)

    driver = webdriver.Chrome(
        service=ChromeService(ChromeDriverManager().install()), options=options
    )
    # try/finally closes the browser on every exit path, including unexpected
    # WebDriver errors; before, only the handled paths called quit().
    try:
        driver.get(url)

        try:
            accept_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable(
                    (By.XPATH, "//button[contains(text(), 'Accept All Cookies')]")
                )
            )
            accept_button.click()
            print("Cookie banner accepted.")
        except TimeoutException:
            print("No cookie banner found or it took too long.")

        # Wait for the wrapper div of the team table to appear.
        try:
            div_element = WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.ID, "div_stats_passing_team"))
            )
            print("✅ Team stats div found, extracting HTML...")

            # Keep only the HTML of the team passing table.
            team_html = div_element.get_attribute("outerHTML")
        except TimeoutException:
            print("❌ The team stats table could not be found on the page. Saving debug files...")
            driver.save_screenshot("debug_screenshot.png")
            with open("debug_page.html", "w", encoding="utf-8") as f:
                f.write(driver.page_source)
            return None
    finally:
        driver.quit()

    print("Data downloaded. Processing with pandas...")

    # Parse the table from the HTML fragment.
    team_df = pd.read_html(StringIO(team_html))[0]
    print(f"✅ Found team table with shape: {team_df.shape}")

    # With a two-row header, match on the bottom-level stat name only.
    # Substring matching on the flattened names also caught every other
    # column group and renamed them all to the same Total_* labels.
    if isinstance(team_df.columns, pd.MultiIndex):
        leaves = [str(col[-1]).strip() for col in team_df.columns]
    else:
        leaves = [str(col).strip() for col in team_df.columns]
    wanted = {
        "Squad": "Squad",
        "Cmp": "Total_Cmp",
        "Att": "Total_Att",
        "Cmp%": "Total_Cmp%",
        "TotDist": "Total_TotDist",
    }
    # Keep the first occurrence of each stat, in table order.
    # NOTE(review): presumably the first Cmp/Att/Cmp% belong to the "Total"
    # group -- confirm against the live table.
    picked = sorted(
        (leaves.index(leaf), name) for leaf, name in wanted.items() if leaf in leaves
    )
    team_df = team_df.iloc[:, [pos for pos, _ in picked]].copy()
    team_df.columns = [name for _, name in picked]

    # Drop rows without a squad and rows whose Squad cell contains the
    # header labels "Squad" or "Rk".
    team_df = team_df[team_df["Squad"].notna()]
    team_df = team_df[~team_df["Squad"].str.contains("Squad|Rk", na=False)]

    return team_df
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def filter_teams(df, teams):
    """Return the rows of ``df`` whose ``Squad`` value is one of ``teams``."""
    is_selected = df["Squad"].isin(teams)
    return df.loc[is_selected]
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
def main():
    """Download the team passing table, save it as CSV and print two squads.

    Does nothing when the scraper returns ``None``.
    """
    df = pull_premier_league_team_passing()
    if df is None:
        # Nothing to save or print when the scrape returned no table.
        return

    # Save the full table to CSV.
    df.to_csv("premier_league_team_passing.csv", index=False)
    # Emoji restored from mis-decoded UTF-8 bytes in the original literal.
    print("\n💾 Saved to premier_league_team_passing.csv")

    # A stray second closing quote after "Brighton" made this line a
    # syntax error.
    teams = ["Wolves", "Brighton"]
    df_filtered = filter_teams(df, teams)
    # Build the heading from `teams`; the hard-coded text still named
    # Arsenal and Nottingham Forest.
    label = " & ".join(teams)
    print(f"\n📊 Passing Stats for {label} (Team Level)")
    print("=" * 70)
    print(df_filtered)


if __name__ == "__main__":
    main()
|
.history/fbrefdata_example_20251005091839.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from io import StringIO
|
| 4 |
+
from selenium import webdriver
|
| 5 |
+
from selenium.webdriver.chrome.service import Service as ChromeService
|
| 6 |
+
from selenium.webdriver.chrome.options import Options as ChromeOptions
|
| 7 |
+
from webdriver_manager.chrome import ChromeDriverManager
|
| 8 |
+
from selenium.webdriver.common.by import By
|
| 9 |
+
from selenium.webdriver.support.ui import WebDriverWait
|
| 10 |
+
from selenium.webdriver.support import expected_conditions as EC
|
| 11 |
+
from selenium.common.exceptions import TimeoutException
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def pull_premier_league_team_passing():
    """Scrape the Premier League team passing table from FBref.

    Opens Chrome through Selenium, clicks the cookie banner when one becomes
    clickable within 10 seconds, waits up to 20 seconds for the element with
    id ``div_stats_passing_team`` and parses the table inside it with pandas.

    Returns:
        A DataFrame with the columns ``Squad``, ``Total_Cmp``, ``Total_Att``,
        ``Total_Cmp%`` and ``Total_TotDist`` (those that the table provides),
        or ``None`` when the wrapper div is not found in time. In that case
        ``debug_screenshot.png`` and ``debug_page.html`` are written to the
        current working directory.

    Raises:
        KeyError: If the parsed table has no ``Squad`` column.
    """
    url = "https://fbref.com/en/comps/9/passing/Premier-League-Stats"
    print(f"Opening browser to download team passing stats from {url} ...")

    options = ChromeOptions()
    options.add_argument("--start-maximized")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    # Add "--headless" here when the browser window does not need to be seen.
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)

    driver = webdriver.Chrome(
        service=ChromeService(ChromeDriverManager().install()), options=options
    )
    # try/finally closes the browser on every exit path, including unexpected
    # WebDriver errors; before, only the handled paths called quit().
    try:
        driver.get(url)

        try:
            accept_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable(
                    (By.XPATH, "//button[contains(text(), 'Accept All Cookies')]")
                )
            )
            accept_button.click()
            print("Cookie banner accepted.")
        except TimeoutException:
            print("No cookie banner found or it took too long.")

        # Wait for the wrapper div of the team table to appear.
        try:
            div_element = WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.ID, "div_stats_passing_team"))
            )
            print("✅ Team stats div found, extracting HTML...")

            # Keep only the HTML of the team passing table.
            team_html = div_element.get_attribute("outerHTML")
        except TimeoutException:
            print("❌ The team stats table could not be found on the page. Saving debug files...")
            driver.save_screenshot("debug_screenshot.png")
            with open("debug_page.html", "w", encoding="utf-8") as f:
                f.write(driver.page_source)
            return None
    finally:
        driver.quit()

    print("Data downloaded. Processing with pandas...")

    # Parse the table from the HTML fragment.
    team_df = pd.read_html(StringIO(team_html))[0]
    print(f"✅ Found team table with shape: {team_df.shape}")

    # With a two-row header, match on the bottom-level stat name only.
    # Substring matching on the flattened names also caught every other
    # column group and renamed them all to the same Total_* labels.
    if isinstance(team_df.columns, pd.MultiIndex):
        leaves = [str(col[-1]).strip() for col in team_df.columns]
    else:
        leaves = [str(col).strip() for col in team_df.columns]
    wanted = {
        "Squad": "Squad",
        "Cmp": "Total_Cmp",
        "Att": "Total_Att",
        "Cmp%": "Total_Cmp%",
        "TotDist": "Total_TotDist",
    }
    # Keep the first occurrence of each stat, in table order.
    # NOTE(review): presumably the first Cmp/Att/Cmp% belong to the "Total"
    # group -- confirm against the live table.
    picked = sorted(
        (leaves.index(leaf), name) for leaf, name in wanted.items() if leaf in leaves
    )
    team_df = team_df.iloc[:, [pos for pos, _ in picked]].copy()
    team_df.columns = [name for _, name in picked]

    # Drop rows without a squad and rows whose Squad cell contains the
    # header labels "Squad" or "Rk".
    team_df = team_df[team_df["Squad"].notna()]
    team_df = team_df[~team_df["Squad"].str.contains("Squad|Rk", na=False)]

    return team_df
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def filter_teams(df, teams):
    """Return the rows of ``df`` whose ``Squad`` value is one of ``teams``."""
    is_selected = df["Squad"].isin(teams)
    return df.loc[is_selected]
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
def main():
    """Download the team passing table, save it as CSV and print two squads.

    Does nothing when the scraper returns ``None``.
    """
    df = pull_premier_league_team_passing()
    if df is None:
        # Nothing to save or print when the scrape returned no table.
        return

    # Save the full table to CSV.
    df.to_csv("premier_league_team_passing.csv", index=False)
    # Emoji restored from mis-decoded UTF-8 bytes in the original literal.
    print("\n💾 Saved to premier_league_team_passing.csv")

    teams = ["Wolves", "Brighton"]
    df_filtered = filter_teams(df, teams)
    # Build the heading from `teams`; the hard-coded text still named
    # Arsenal and Nottingham Forest.
    label = " & ".join(teams)
    print(f"\n📊 Passing Stats for {label} (Team Level)")
    print("=" * 70)
    print(df_filtered)


if __name__ == "__main__":
    main()
|
.history/fbrefdata_example_20251005091854.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from io import StringIO
|
| 4 |
+
from selenium import webdriver
|
| 5 |
+
from selenium.webdriver.chrome.service import Service as ChromeService
|
| 6 |
+
from selenium.webdriver.chrome.options import Options as ChromeOptions
|
| 7 |
+
from webdriver_manager.chrome import ChromeDriverManager
|
| 8 |
+
from selenium.webdriver.common.by import By
|
| 9 |
+
from selenium.webdriver.support.ui import WebDriverWait
|
| 10 |
+
from selenium.webdriver.support import expected_conditions as EC
|
| 11 |
+
from selenium.common.exceptions import TimeoutException
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def pull_premier_league_team_passing():
    """Scrape the Premier League team passing table from FBref.

    Opens Chrome through Selenium, clicks the cookie banner when one becomes
    clickable within 10 seconds, waits up to 20 seconds for the element with
    id ``div_stats_passing_team`` and parses the table inside it with pandas.

    Returns:
        A DataFrame with the columns ``Squad``, ``Total_Cmp``, ``Total_Att``,
        ``Total_Cmp%`` and ``Total_TotDist`` (those that the table provides),
        or ``None`` when the wrapper div is not found in time. In that case
        ``debug_screenshot.png`` and ``debug_page.html`` are written to the
        current working directory.

    Raises:
        KeyError: If the parsed table has no ``Squad`` column.
    """
    url = "https://fbref.com/en/comps/9/passing/Premier-League-Stats"
    print(f"Opening browser to download team passing stats from {url} ...")

    options = ChromeOptions()
    options.add_argument("--start-maximized")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    # Add "--headless" here when the browser window does not need to be seen.
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)

    driver = webdriver.Chrome(
        service=ChromeService(ChromeDriverManager().install()), options=options
    )
    # try/finally closes the browser on every exit path, including unexpected
    # WebDriver errors; before, only the handled paths called quit().
    try:
        driver.get(url)

        try:
            accept_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable(
                    (By.XPATH, "//button[contains(text(), 'Accept All Cookies')]")
                )
            )
            accept_button.click()
            print("Cookie banner accepted.")
        except TimeoutException:
            print("No cookie banner found or it took too long.")

        # Wait for the wrapper div of the team table to appear.
        try:
            div_element = WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.ID, "div_stats_passing_team"))
            )
            print("✅ Team stats div found, extracting HTML...")

            # Keep only the HTML of the team passing table.
            team_html = div_element.get_attribute("outerHTML")
        except TimeoutException:
            print("❌ The team stats table could not be found on the page. Saving debug files...")
            driver.save_screenshot("debug_screenshot.png")
            with open("debug_page.html", "w", encoding="utf-8") as f:
                f.write(driver.page_source)
            return None
    finally:
        driver.quit()

    print("Data downloaded. Processing with pandas...")

    # Parse the table from the HTML fragment.
    team_df = pd.read_html(StringIO(team_html))[0]
    print(f"✅ Found team table with shape: {team_df.shape}")

    # With a two-row header, match on the bottom-level stat name only.
    # Substring matching on the flattened names also caught every other
    # column group and renamed them all to the same Total_* labels.
    if isinstance(team_df.columns, pd.MultiIndex):
        leaves = [str(col[-1]).strip() for col in team_df.columns]
    else:
        leaves = [str(col).strip() for col in team_df.columns]
    wanted = {
        "Squad": "Squad",
        "Cmp": "Total_Cmp",
        "Att": "Total_Att",
        "Cmp%": "Total_Cmp%",
        "TotDist": "Total_TotDist",
    }
    # Keep the first occurrence of each stat, in table order.
    # NOTE(review): presumably the first Cmp/Att/Cmp% belong to the "Total"
    # group -- confirm against the live table.
    picked = sorted(
        (leaves.index(leaf), name) for leaf, name in wanted.items() if leaf in leaves
    )
    team_df = team_df.iloc[:, [pos for pos, _ in picked]].copy()
    team_df.columns = [name for _, name in picked]

    # Drop rows without a squad and rows whose Squad cell contains the
    # header labels "Squad" or "Rk".
    team_df = team_df[team_df["Squad"].notna()]
    team_df = team_df[~team_df["Squad"].str.contains("Squad|Rk", na=False)]

    return team_df
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def filter_teams(df, teams):
    """Return the rows of ``df`` whose ``Squad`` value is one of ``teams``."""
    is_selected = df["Squad"].isin(teams)
    return df.loc[is_selected]
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
def main():
    """Download the team passing table, save it as CSV and print two squads.

    Does nothing when the scraper returns ``None``.
    """
    df = pull_premier_league_team_passing()
    if df is None:
        # Nothing to save or print when the scrape returned no table.
        return

    # Save the full table to CSV.
    df.to_csv("premier_league_team_passing.csv", index=False)
    # Emoji restored from mis-decoded UTF-8 bytes in the original literal.
    print("\n💾 Saved to premier_league_team_passing.csv")

    teams = ["Wolves", "Brighton"]
    df_filtered = filter_teams(df, teams)
    # Build the heading from `teams`; the hard-coded text named only Wolves
    # although two squads are printed.
    label = " & ".join(teams)
    print(f"\n📊 Passing Stats for {label} (Team Level)")
    print("=" * 70)
    print(df_filtered)


if __name__ == "__main__":
    main()
|
.history/fbrefdata_example_20251005091857.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from io import StringIO
|
| 4 |
+
from selenium import webdriver
|
| 5 |
+
from selenium.webdriver.chrome.service import Service as ChromeService
|
| 6 |
+
from selenium.webdriver.chrome.options import Options as ChromeOptions
|
| 7 |
+
from webdriver_manager.chrome import ChromeDriverManager
|
| 8 |
+
from selenium.webdriver.common.by import By
|
| 9 |
+
from selenium.webdriver.support.ui import WebDriverWait
|
| 10 |
+
from selenium.webdriver.support import expected_conditions as EC
|
| 11 |
+
from selenium.common.exceptions import TimeoutException
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def pull_premier_league_team_passing():
    """Scrape the Premier League team passing table from FBref.

    Opens Chrome through Selenium, clicks the cookie banner when one becomes
    clickable within 10 seconds, waits up to 20 seconds for the element with
    id ``div_stats_passing_team`` and parses the table inside it with pandas.

    Returns:
        A DataFrame with the columns ``Squad``, ``Total_Cmp``, ``Total_Att``,
        ``Total_Cmp%`` and ``Total_TotDist`` (those that the table provides),
        or ``None`` when the wrapper div is not found in time. In that case
        ``debug_screenshot.png`` and ``debug_page.html`` are written to the
        current working directory.

    Raises:
        KeyError: If the parsed table has no ``Squad`` column.
    """
    url = "https://fbref.com/en/comps/9/passing/Premier-League-Stats"
    print(f"Opening browser to download team passing stats from {url} ...")

    options = ChromeOptions()
    options.add_argument("--start-maximized")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    # Add "--headless" here when the browser window does not need to be seen.
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)

    driver = webdriver.Chrome(
        service=ChromeService(ChromeDriverManager().install()), options=options
    )
    # try/finally closes the browser on every exit path, including unexpected
    # WebDriver errors; before, only the handled paths called quit().
    try:
        driver.get(url)

        try:
            accept_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable(
                    (By.XPATH, "//button[contains(text(), 'Accept All Cookies')]")
                )
            )
            accept_button.click()
            print("Cookie banner accepted.")
        except TimeoutException:
            print("No cookie banner found or it took too long.")

        # Wait for the wrapper div of the team table to appear.
        try:
            div_element = WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.ID, "div_stats_passing_team"))
            )
            print("✅ Team stats div found, extracting HTML...")

            # Keep only the HTML of the team passing table.
            team_html = div_element.get_attribute("outerHTML")
        except TimeoutException:
            print("❌ The team stats table could not be found on the page. Saving debug files...")
            driver.save_screenshot("debug_screenshot.png")
            with open("debug_page.html", "w", encoding="utf-8") as f:
                f.write(driver.page_source)
            return None
    finally:
        driver.quit()

    print("Data downloaded. Processing with pandas...")

    # Parse the table from the HTML fragment.
    team_df = pd.read_html(StringIO(team_html))[0]
    print(f"✅ Found team table with shape: {team_df.shape}")

    # With a two-row header, match on the bottom-level stat name only.
    # Substring matching on the flattened names also caught every other
    # column group and renamed them all to the same Total_* labels.
    if isinstance(team_df.columns, pd.MultiIndex):
        leaves = [str(col[-1]).strip() for col in team_df.columns]
    else:
        leaves = [str(col).strip() for col in team_df.columns]
    wanted = {
        "Squad": "Squad",
        "Cmp": "Total_Cmp",
        "Att": "Total_Att",
        "Cmp%": "Total_Cmp%",
        "TotDist": "Total_TotDist",
    }
    # Keep the first occurrence of each stat, in table order.
    # NOTE(review): presumably the first Cmp/Att/Cmp% belong to the "Total"
    # group -- confirm against the live table.
    picked = sorted(
        (leaves.index(leaf), name) for leaf, name in wanted.items() if leaf in leaves
    )
    team_df = team_df.iloc[:, [pos for pos, _ in picked]].copy()
    team_df.columns = [name for _, name in picked]

    # Drop rows without a squad and rows whose Squad cell contains the
    # header labels "Squad" or "Rk".
    team_df = team_df[team_df["Squad"].notna()]
    team_df = team_df[~team_df["Squad"].str.contains("Squad|Rk", na=False)]

    return team_df
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def filter_teams(df, teams):
    """Return the rows of ``df`` whose ``Squad`` value is one of ``teams``."""
    is_selected = df["Squad"].isin(teams)
    return df.loc[is_selected]
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
def main():
    """Download the team passing table, save it as CSV and print two squads.

    Does nothing when the scraper returns ``None``.
    """
    df = pull_premier_league_team_passing()
    if df is None:
        # Nothing to save or print when the scrape returned no table.
        return

    # Save the full table to CSV.
    df.to_csv("premier_league_team_passing.csv", index=False)
    # Emoji restored from mis-decoded UTF-8 bytes in the original literal.
    print("\n💾 Saved to premier_league_team_passing.csv")

    teams = ["Wolves", "Brighton"]
    df_filtered = filter_teams(df, teams)
    # Build the heading from `teams`; the hard-coded text was cut off after
    # "Wolves &".
    label = " & ".join(teams)
    print(f"\n📊 Passing Stats for {label} (Team Level)")
    print("=" * 70)
    print(df_filtered)


if __name__ == "__main__":
    main()
|
.history/fbrefdata_example_20251005091858.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from io import StringIO
|
| 4 |
+
from selenium import webdriver
|
| 5 |
+
from selenium.webdriver.chrome.service import Service as ChromeService
|
| 6 |
+
from selenium.webdriver.chrome.options import Options as ChromeOptions
|
| 7 |
+
from webdriver_manager.chrome import ChromeDriverManager
|
| 8 |
+
from selenium.webdriver.common.by import By
|
| 9 |
+
from selenium.webdriver.support.ui import WebDriverWait
|
| 10 |
+
from selenium.webdriver.support import expected_conditions as EC
|
| 11 |
+
from selenium.common.exceptions import TimeoutException
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def pull_premier_league_team_passing():
    """Scrape the Premier League team passing table from FBref.

    Opens the stats page in Chrome via Selenium, dismisses the cookie banner
    when one shows up, grabs the ``div_stats_passing_team`` wrapper and parses
    the first table inside it with pandas.

    Returns:
        A DataFrame with a ``Squad`` column plus whichever of ``Total_Cmp``,
        ``Total_Att``, ``Total_Cmp%`` and ``Total_TotDist`` were found, or
        ``None`` when the wrapper div does not appear within 20 seconds (a
        screenshot and the page source are saved for debugging in that case).

    Raises:
        KeyError: if the parsed table has no ``Squad`` column.
    """
    url = "https://fbref.com/en/comps/9/passing/Premier-League-Stats"
    print(f"Opening browser to download team passing stats from {url} ...")

    options = ChromeOptions()
    options.add_argument("--start-maximized")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    # options.add_argument("--headless")  # enable when the browser window need not be visible
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)

    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)
    # try/finally so the browser is closed on every exit path, including
    # exceptions other than TimeoutException.
    try:
        driver.get(url)

        try:
            accept_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable(
                (By.XPATH, "//button[contains(text(), 'Accept All Cookies')]")))
            accept_button.click()
            print("Cookie banner accepted.")
        except TimeoutException:
            print("No cookie banner found or it took too long.")

        # Wait for the wrapper div of the team table to be present.
        try:
            div_element = WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.ID, "div_stats_passing_team")))
        except TimeoutException:
            print("❌ The team stats table could not be found on the page. Saving debug files...")
            driver.save_screenshot('debug_screenshot.png')
            with open('debug_page.html', 'w', encoding='utf-8') as f:
                f.write(driver.page_source)
            return None

        print("✅ Team stats div found, extracting HTML...")
        # Only the HTML of the team passing wrapper is handed to pandas.
        team_html = div_element.get_attribute("outerHTML")
    finally:
        driver.quit()

    print("Data downloaded. Processing with pandas...")
    team_df = pd.read_html(StringIO(team_html))[0]
    print(f"✅ Found team table with shape: {team_df.shape}")
    return _tidy_team_passing_table(team_df)


def _tidy_team_passing_table(team_df):
    """Reduce a raw passing table to ``Squad`` plus uniquely named total columns."""
    # A two-row header comes back as a MultiIndex; flatten it to "Group_Name".
    if isinstance(team_df.columns, pd.MultiIndex):
        team_df.columns = ['_'.join(map(str, col)).strip() for col in team_df.columns.values]

    # Checked in order, so 'Cmp%' is tested before the shorter 'Cmp'.
    patterns = (
        ('Squad', 'Squad'),
        ('Cmp%', 'Total_Cmp%'),
        ('Cmp', 'Total_Cmp'),
        ('Att', 'Total_Att'),
        ('TotDist', 'Total_TotDist'),
    )
    rename_map = {}
    for col in team_df.columns:
        target = next((new for key, new in patterns if key in str(col)), None)
        # Only the first column matching a target is kept; mapping every match
        # onto the same name would produce duplicate column labels.
        # NOTE(review): presumably the first match is the "Total" group the
        # target names refer to - confirm against the page layout.
        if target is not None and target not in rename_map.values():
            rename_map[col] = target

    if 'Squad' not in rename_map.values():
        raise KeyError('Squad')

    # Select and rename in one step so a fresh frame is produced rather than
    # renaming a slice in place.
    team_df = team_df[list(rename_map)].rename(columns=rename_map)

    # Drop rows without a squad and rows whose Squad cell contains "Squad" or
    # "Rk" (presumably header rows repeated inside the table body).
    team_df = team_df[team_df['Squad'].notna()]
    team_df = team_df[~team_df['Squad'].str.contains("Squad|Rk", na=False)]
    return team_df
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def filter_teams(df, teams):
    """Return the rows of ``df`` whose ``Squad`` value is one of ``teams``.

    Args:
        df: DataFrame with a ``Squad`` column.
        teams: A list-like of squad names, or a single squad name as a string.

    Returns:
        The matching rows of ``df``, in their original order.

    Raises:
        KeyError: if ``df`` has no ``Squad`` column.
    """
    if isinstance(teams, str):
        # ``Series.isin`` rejects a bare string, so wrap a single name.
        teams = [teams]
    return df[df["Squad"].isin(teams)]
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
def main():
    """Download the team passing table, save it as CSV and print two teams.

    Does nothing when the download step returns ``None``.
    """
    df = pull_premier_league_team_passing()
    if df is None:
        return

    # Save the full table to CSV before narrowing it down.
    df.to_csv("premier_league_team_passing.csv", index=False)
    print("\n💾 Saved to premier_league_team_passing.csv")

    teams = ["Wolves", "Brighton"]
    df_filtered = filter_teams(df, teams)
    print("\n📊 Passing Stats for Wolves & Brighton (Team Level)")
    print("=" * 70)
    print(df_filtered)


if __name__ == "__main__":
    main()
|
.history/fbrefdata_example_20251005092140.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from io import StringIO
|
| 4 |
+
from selenium import webdriver
|
| 5 |
+
from selenium.webdriver.chrome.service import Service as ChromeService
|
| 6 |
+
from selenium.webdriver.chrome.options import Options as ChromeOptions
|
| 7 |
+
from webdriver_manager.chrome import ChromeDriverManager
|
| 8 |
+
from selenium.webdriver.common.by import By
|
| 9 |
+
from selenium.webdriver.support.ui import WebDriverWait
|
| 10 |
+
from selenium.webdriver.support import expected_conditions as EC
|
| 11 |
+
from selenium.common.exceptions import TimeoutException
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def pull_premier_league_team_passing():
    """Scrape the Premier League team passing table from FBref.

    Opens the stats page in Chrome via Selenium, dismisses the cookie banner
    when one shows up, grabs the ``div_stats_passing_team`` wrapper and parses
    the first table inside it with pandas.

    Returns:
        A DataFrame with a ``Squad`` column plus whichever of ``Total_Cmp``,
        ``Total_Att``, ``Total_Cmp%`` and ``Total_TotDist`` were found, or
        ``None`` when the wrapper div does not appear within 20 seconds (a
        screenshot and the page source are saved for debugging in that case).

    Raises:
        KeyError: if the parsed table has no ``Squad`` column.
    """
    url = "https://fbref.com/en/comps/9/passing/Premier-League-Stats"
    print(f"Opening browser to download team passing stats from {url} ...")
    # NOTE(review): fixed 5 s pause before the browser starts - presumably to
    # space out requests; confirm it is still wanted.
    time.sleep(5)

    options = ChromeOptions()
    options.add_argument("--start-maximized")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    # options.add_argument("--headless")  # enable when the browser window need not be visible
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)

    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)
    # try/finally so the browser is closed on every exit path, including
    # exceptions other than TimeoutException.
    try:
        driver.get(url)

        try:
            accept_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable(
                (By.XPATH, "//button[contains(text(), 'Accept All Cookies')]")))
            accept_button.click()
            print("Cookie banner accepted.")
        except TimeoutException:
            print("No cookie banner found or it took too long.")

        # Wait for the wrapper div of the team table to be present.
        try:
            div_element = WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.ID, "div_stats_passing_team")))
        except TimeoutException:
            print("❌ The team stats table could not be found on the page. Saving debug files...")
            driver.save_screenshot('debug_screenshot.png')
            with open('debug_page.html', 'w', encoding='utf-8') as f:
                f.write(driver.page_source)
            return None

        print("✅ Team stats div found, extracting HTML...")
        # Only the HTML of the team passing wrapper is handed to pandas.
        team_html = div_element.get_attribute("outerHTML")
    finally:
        driver.quit()

    print("Data downloaded. Processing with pandas...")
    team_df = pd.read_html(StringIO(team_html))[0]
    print(f"✅ Found team table with shape: {team_df.shape}")
    return _tidy_team_passing_table(team_df)


def _tidy_team_passing_table(team_df):
    """Reduce a raw passing table to ``Squad`` plus uniquely named total columns."""
    # A two-row header comes back as a MultiIndex; flatten it to "Group_Name".
    if isinstance(team_df.columns, pd.MultiIndex):
        team_df.columns = ['_'.join(map(str, col)).strip() for col in team_df.columns.values]

    # Checked in order, so 'Cmp%' is tested before the shorter 'Cmp'.
    patterns = (
        ('Squad', 'Squad'),
        ('Cmp%', 'Total_Cmp%'),
        ('Cmp', 'Total_Cmp'),
        ('Att', 'Total_Att'),
        ('TotDist', 'Total_TotDist'),
    )
    rename_map = {}
    for col in team_df.columns:
        target = next((new for key, new in patterns if key in str(col)), None)
        # Only the first column matching a target is kept; mapping every match
        # onto the same name would produce duplicate column labels.
        # NOTE(review): presumably the first match is the "Total" group the
        # target names refer to - confirm against the page layout.
        if target is not None and target not in rename_map.values():
            rename_map[col] = target

    if 'Squad' not in rename_map.values():
        raise KeyError('Squad')

    # Select and rename in one step so a fresh frame is produced rather than
    # renaming a slice in place.
    team_df = team_df[list(rename_map)].rename(columns=rename_map)

    # Drop rows without a squad and rows whose Squad cell contains "Squad" or
    # "Rk" (presumably header rows repeated inside the table body).
    team_df = team_df[team_df['Squad'].notna()]
    team_df = team_df[~team_df['Squad'].str.contains("Squad|Rk", na=False)]
    return team_df
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
def filter_teams(df, teams):
    """Return the rows of ``df`` whose ``Squad`` value is one of ``teams``.

    Args:
        df: DataFrame with a ``Squad`` column.
        teams: A list-like of squad names, or a single squad name as a string.

    Returns:
        The matching rows of ``df``, in their original order.

    Raises:
        KeyError: if ``df`` has no ``Squad`` column.
    """
    if isinstance(teams, str):
        # ``Series.isin`` rejects a bare string, so wrap a single name.
        teams = [teams]
    return df[df["Squad"].isin(teams)]
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def main():
    """Download the team passing table, save it as CSV and print two teams.

    Does nothing when the download step returns ``None``.
    """
    df = pull_premier_league_team_passing()
    if df is None:
        return

    # Save the full table to CSV before narrowing it down.
    df.to_csv("premier_league_team_passing.csv", index=False)
    print("\n💾 Saved to premier_league_team_passing.csv")

    teams = ["Wolves", "Brighton"]
    df_filtered = filter_teams(df, teams)
    print("\n📊 Passing Stats for Wolves & Brighton (Team Level)")
    print("=" * 70)
    print(df_filtered)


if __name__ == "__main__":
    main()
|
.history/fbrefdata_example_20251005092144.py
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from io import StringIO
|
| 4 |
+
from selenium import webdriver
|
| 5 |
+
from selenium.webdriver.chrome.service import Service as ChromeService
|
| 6 |
+
from selenium.webdriver.chrome.options import Options as ChromeOptions
|
| 7 |
+
from webdriver_manager.chrome import ChromeDriverManager
|
| 8 |
+
from selenium.webdriver.common.by import By
|
| 9 |
+
from selenium.webdriver.support.ui import WebDriverWait
|
| 10 |
+
from selenium.webdriver.support import expected_conditions as EC
|
| 11 |
+
from selenium.common.exceptions import TimeoutException
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def pull_premier_league_team_passing():
    """Scrape the Premier League team passing table from FBref.

    Opens the stats page in Chrome via Selenium, dismisses the cookie banner
    when one shows up, grabs the ``div_stats_passing_team`` wrapper and parses
    the first table inside it with pandas.

    Returns:
        A DataFrame with a ``Squad`` column plus whichever of ``Total_Cmp``,
        ``Total_Att``, ``Total_Cmp%`` and ``Total_TotDist`` were found, or
        ``None`` when the wrapper div does not appear within 20 seconds (a
        screenshot and the page source are saved for debugging in that case).

    Raises:
        KeyError: if the parsed table has no ``Squad`` column.
    """
    url = "https://fbref.com/en/comps/9/passing/Premier-League-Stats"
    print(f"Opening browser to download team passing stats from {url} ...")
    # NOTE(review): fixed 5 s pause before the browser starts - presumably to
    # space out requests; confirm it is still wanted.
    time.sleep(5)

    options = ChromeOptions()
    options.add_argument("--start-maximized")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    # options.add_argument("--headless")  # enable when the browser window need not be visible
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)

    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)
    # try/finally so the browser is closed on every exit path, including
    # exceptions other than TimeoutException.
    try:
        driver.get(url)

        try:
            accept_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable(
                (By.XPATH, "//button[contains(text(), 'Accept All Cookies')]")))
            accept_button.click()
            print("Cookie banner accepted.")
        except TimeoutException:
            print("No cookie banner found or it took too long.")

        # Wait for the wrapper div of the team table to be present.
        try:
            div_element = WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.ID, "div_stats_passing_team")))
        except TimeoutException:
            print("❌ The team stats table could not be found on the page. Saving debug files...")
            driver.save_screenshot('debug_screenshot.png')
            with open('debug_page.html', 'w', encoding='utf-8') as f:
                f.write(driver.page_source)
            return None

        print("✅ Team stats div found, extracting HTML...")
        # Only the HTML of the team passing wrapper is handed to pandas.
        team_html = div_element.get_attribute("outerHTML")
    finally:
        driver.quit()

    print("Data downloaded. Processing with pandas...")
    team_df = pd.read_html(StringIO(team_html))[0]
    print(f"✅ Found team table with shape: {team_df.shape}")
    return _tidy_team_passing_table(team_df)


def _tidy_team_passing_table(team_df):
    """Reduce a raw passing table to ``Squad`` plus uniquely named total columns."""
    # A two-row header comes back as a MultiIndex; flatten it to "Group_Name".
    if isinstance(team_df.columns, pd.MultiIndex):
        team_df.columns = ['_'.join(map(str, col)).strip() for col in team_df.columns.values]

    # Checked in order, so 'Cmp%' is tested before the shorter 'Cmp'.
    patterns = (
        ('Squad', 'Squad'),
        ('Cmp%', 'Total_Cmp%'),
        ('Cmp', 'Total_Cmp'),
        ('Att', 'Total_Att'),
        ('TotDist', 'Total_TotDist'),
    )
    rename_map = {}
    for col in team_df.columns:
        target = next((new for key, new in patterns if key in str(col)), None)
        # Only the first column matching a target is kept; mapping every match
        # onto the same name would produce duplicate column labels.
        # NOTE(review): presumably the first match is the "Total" group the
        # target names refer to - confirm against the page layout.
        if target is not None and target not in rename_map.values():
            rename_map[col] = target

    if 'Squad' not in rename_map.values():
        raise KeyError('Squad')

    # Select and rename in one step so a fresh frame is produced rather than
    # renaming a slice in place.
    team_df = team_df[list(rename_map)].rename(columns=rename_map)

    # Drop rows without a squad and rows whose Squad cell contains "Squad" or
    # "Rk" (presumably header rows repeated inside the table body).
    team_df = team_df[team_df['Squad'].notna()]
    team_df = team_df[~team_df['Squad'].str.contains("Squad|Rk", na=False)]
    return team_df
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
def filter_teams(df, teams):
    """Return the rows of ``df`` whose ``Squad`` value is one of ``teams``.

    Args:
        df: DataFrame with a ``Squad`` column.
        teams: A list-like of squad names, or a single squad name as a string.

    Returns:
        The matching rows of ``df``, in their original order.

    Raises:
        KeyError: if ``df`` has no ``Squad`` column.
    """
    if isinstance(teams, str):
        # ``Series.isin`` rejects a bare string, so wrap a single name.
        teams = [teams]
    return df[df["Squad"].isin(teams)]
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def main():
    """Download the team passing table, save it as CSV and print two teams.

    Does nothing when the download step returns ``None``.
    """
    df = pull_premier_league_team_passing()
    if df is None:
        return

    # Save the full table to CSV before narrowing it down.
    df.to_csv("premier_league_team_passing.csv", index=False)
    print("\n💾 Saved to premier_league_team_passing.csv")

    teams = ["Wolves", "Brighton"]
    df_filtered = filter_teams(df, teams)
    print("\n📊 Passing Stats for Wolves & Brighton (Team Level)")
    print("=" * 70)
    print(df_filtered)


if __name__ == "__main__":
    main()
|
.history/fbrefdata_example_20251005092150.py
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from io import StringIO
|
| 4 |
+
from selenium import webdriver
|
| 5 |
+
from selenium.webdriver.chrome.service import Service as ChromeService
|
| 6 |
+
from selenium.webdriver.chrome.options import Options as ChromeOptions
|
| 7 |
+
from webdriver_manager.chrome import ChromeDriverManager
|
| 8 |
+
from selenium.webdriver.common.by import By
|
| 9 |
+
from selenium.webdriver.support.ui import WebDriverWait
|
| 10 |
+
from selenium.webdriver.support import expected_conditions as EC
|
| 11 |
+
from selenium.common.exceptions import TimeoutException
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def pull_premier_league_team_passing():
    """Scrape the Premier League team passing table from FBref.

    Opens the stats page in Chrome via Selenium, dismisses the cookie banner
    when one shows up, grabs the ``div_stats_passing_team`` wrapper and parses
    the first table inside it with pandas.

    Returns:
        A DataFrame with a ``Squad`` column plus whichever of ``Total_Cmp``,
        ``Total_Att``, ``Total_Cmp%`` and ``Total_TotDist`` were found, or
        ``None`` when the wrapper div does not appear within 20 seconds (a
        screenshot and the page source are saved for debugging in that case).

    Raises:
        KeyError: if the parsed table has no ``Squad`` column.
    """
    url = "https://fbref.com/en/comps/9/passing/Premier-League-Stats"
    print(f"Opening browser to download team passing stats from {url} ...")
    # NOTE(review): fixed 5 s pause before the browser starts - presumably to
    # space out requests; confirm it is still wanted.
    time.sleep(5)

    options = ChromeOptions()
    options.add_argument("--start-maximized")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    # options.add_argument("--headless")  # enable when the browser window need not be visible
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)

    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)
    # try/finally so the browser is closed on every exit path, including
    # exceptions other than TimeoutException.
    try:
        driver.get(url)

        try:
            accept_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable(
                (By.XPATH, "//button[contains(text(), 'Accept All Cookies')]")))
            accept_button.click()
            print("Cookie banner accepted.")
        except TimeoutException:
            print("No cookie banner found or it took too long.")

        # Wait for the wrapper div of the team table to be present.
        try:
            div_element = WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.ID, "div_stats_passing_team")))
        except TimeoutException:
            print("❌ The team stats table could not be found on the page. Saving debug files...")
            driver.save_screenshot('debug_screenshot.png')
            with open('debug_page.html', 'w', encoding='utf-8') as f:
                f.write(driver.page_source)
            return None

        print("✅ Team stats div found, extracting HTML...")
        # Only the HTML of the team passing wrapper is handed to pandas.
        team_html = div_element.get_attribute("outerHTML")
    finally:
        driver.quit()

    print("Data downloaded. Processing with pandas...")
    team_df = pd.read_html(StringIO(team_html))[0]
    print(f"✅ Found team table with shape: {team_df.shape}")
    return _tidy_team_passing_table(team_df)


def _tidy_team_passing_table(team_df):
    """Reduce a raw passing table to ``Squad`` plus uniquely named total columns."""
    # A two-row header comes back as a MultiIndex; flatten it to "Group_Name".
    if isinstance(team_df.columns, pd.MultiIndex):
        team_df.columns = ['_'.join(map(str, col)).strip() for col in team_df.columns.values]

    # Checked in order, so 'Cmp%' is tested before the shorter 'Cmp'.
    patterns = (
        ('Squad', 'Squad'),
        ('Cmp%', 'Total_Cmp%'),
        ('Cmp', 'Total_Cmp'),
        ('Att', 'Total_Att'),
        ('TotDist', 'Total_TotDist'),
    )
    rename_map = {}
    for col in team_df.columns:
        target = next((new for key, new in patterns if key in str(col)), None)
        # Only the first column matching a target is kept; mapping every match
        # onto the same name would produce duplicate column labels.
        # NOTE(review): presumably the first match is the "Total" group the
        # target names refer to - confirm against the page layout.
        if target is not None and target not in rename_map.values():
            rename_map[col] = target

    if 'Squad' not in rename_map.values():
        raise KeyError('Squad')

    # Select and rename in one step so a fresh frame is produced rather than
    # renaming a slice in place.
    team_df = team_df[list(rename_map)].rename(columns=rename_map)

    # Drop rows without a squad and rows whose Squad cell contains "Squad" or
    # "Rk" (presumably header rows repeated inside the table body).
    team_df = team_df[team_df['Squad'].notna()]
    team_df = team_df[~team_df['Squad'].str.contains("Squad|Rk", na=False)]
    return team_df
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
def filter_teams(df, teams):
    """Return the rows of ``df`` whose ``Squad`` value is one of ``teams``.

    Args:
        df: DataFrame with a ``Squad`` column.
        teams: A list-like of squad names, or a single squad name as a string.

    Returns:
        The matching rows of ``df``, in their original order.

    Raises:
        KeyError: if ``df`` has no ``Squad`` column.
    """
    if isinstance(teams, str):
        # ``Series.isin`` rejects a bare string, so wrap a single name.
        teams = [teams]
    return df[df["Squad"].isin(teams)]
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def main():
    """Download the team passing table, save it as CSV and print two teams.

    Does nothing when the download step returns ``None``.
    """
    df = pull_premier_league_team_passing()
    if df is None:
        return

    # Save the full table to CSV before narrowing it down.
    df.to_csv("premier_league_team_passing.csv", index=False)
    print("\n💾 Saved to premier_league_team_passing.csv")

    teams = ["Wolves", "Brighton"]
    df_filtered = filter_teams(df, teams)
    print("\n📊 Passing Stats for Wolves & Brighton (Team Level)")
    print("=" * 70)
    print(df_filtered)


if __name__ == "__main__":
    main()
|
.history/fbrefdata_example_20251005092800.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from io import StringIO
|
| 4 |
+
from selenium import webdriver
|
| 5 |
+
from selenium.webdriver.chrome.service import Service as ChromeService
|
| 6 |
+
from selenium.webdriver.chrome.options import Options as ChromeOptions
|
| 7 |
+
from webdriver_manager.chrome import ChromeDriverManager
|
| 8 |
+
from selenium.webdriver.common.by import By
|
| 9 |
+
from selenium.webdriver.support.ui import WebDriverWait
|
| 10 |
+
from selenium.webdriver.support import expected_conditions as EC
|
| 11 |
+
from selenium.common.exceptions import TimeoutException
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def pull_premier_league_team_passing():
    """Scrape a Premier League team passing table from FBref.

    Opens the page at ``url`` in Chrome via Selenium, dismisses the cookie
    banner when one shows up, grabs the ``div_stats_passing_team`` wrapper and
    parses the first table inside it with pandas.

    Returns:
        A DataFrame with a ``Squad`` column plus whichever of ``Total_Cmp``,
        ``Total_Att``, ``Total_Cmp%`` and ``Total_TotDist`` were found, or
        ``None`` when the wrapper div does not appear within 20 seconds (a
        screenshot and the page source are saved for debugging in that case).

    Raises:
        KeyError: if the parsed table has no ``Squad`` column.
    """
    # NOTE(review): confirm this page actually contains the
    # "div_stats_passing_team" wrapper that is waited for below.
    url = "https://fbref.com/en/comps/9/teams/Premier-League-Stats"
    print(f"Opening browser to download team passing stats from {url} ...")
    # NOTE(review): fixed 5 s pause before the browser starts - presumably to
    # space out requests; confirm it is still wanted.
    time.sleep(5)

    options = ChromeOptions()
    options.add_argument("--start-maximized")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    # options.add_argument("--headless")  # enable when the browser window need not be visible
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)

    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)
    # try/finally so the browser is closed on every exit path, including
    # exceptions other than TimeoutException.
    try:
        driver.get(url)

        try:
            accept_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable(
                (By.XPATH, "//button[contains(text(), 'Accept All Cookies')]")))
            accept_button.click()
            print("Cookie banner accepted.")
        except TimeoutException:
            print("No cookie banner found or it took too long.")

        # Wait for the wrapper div of the team table to be present.
        try:
            div_element = WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.ID, "div_stats_passing_team")))
        except TimeoutException:
            print("❌ The team stats table could not be found on the page. Saving debug files...")
            driver.save_screenshot('debug_screenshot.png')
            with open('debug_page.html', 'w', encoding='utf-8') as f:
                f.write(driver.page_source)
            return None

        print("✅ Team stats div found, extracting HTML...")
        # Only the HTML of the team passing wrapper is handed to pandas.
        team_html = div_element.get_attribute("outerHTML")
    finally:
        driver.quit()

    print("Data downloaded. Processing with pandas...")
    team_df = pd.read_html(StringIO(team_html))[0]
    print(f"✅ Found team table with shape: {team_df.shape}")
    return _tidy_team_passing_table(team_df)


def _tidy_team_passing_table(team_df):
    """Reduce a raw passing table to ``Squad`` plus uniquely named total columns."""
    # A two-row header comes back as a MultiIndex; flatten it to "Group_Name".
    if isinstance(team_df.columns, pd.MultiIndex):
        team_df.columns = ['_'.join(map(str, col)).strip() for col in team_df.columns.values]

    # Checked in order, so 'Cmp%' is tested before the shorter 'Cmp'.
    patterns = (
        ('Squad', 'Squad'),
        ('Cmp%', 'Total_Cmp%'),
        ('Cmp', 'Total_Cmp'),
        ('Att', 'Total_Att'),
        ('TotDist', 'Total_TotDist'),
    )
    rename_map = {}
    for col in team_df.columns:
        target = next((new for key, new in patterns if key in str(col)), None)
        # Only the first column matching a target is kept; mapping every match
        # onto the same name would produce duplicate column labels.
        # NOTE(review): presumably the first match is the "Total" group the
        # target names refer to - confirm against the page layout.
        if target is not None and target not in rename_map.values():
            rename_map[col] = target

    if 'Squad' not in rename_map.values():
        raise KeyError('Squad')

    # Select and rename in one step so a fresh frame is produced rather than
    # renaming a slice in place.
    team_df = team_df[list(rename_map)].rename(columns=rename_map)

    # Drop rows without a squad and rows whose Squad cell contains "Squad" or
    # "Rk" (presumably header rows repeated inside the table body).
    team_df = team_df[team_df['Squad'].notna()]
    team_df = team_df[~team_df['Squad'].str.contains("Squad|Rk", na=False)]
    return team_df
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
def filter_teams(df, teams):
    """Return the rows of ``df`` whose ``Squad`` value is one of ``teams``.

    Args:
        df: DataFrame with a ``Squad`` column.
        teams: A list-like of squad names, or a single squad name as a string.

    Returns:
        The matching rows of ``df``, in their original order.

    Raises:
        KeyError: if ``df`` has no ``Squad`` column.
    """
    if isinstance(teams, str):
        # ``Series.isin`` rejects a bare string, so wrap a single name.
        teams = [teams]
    return df[df["Squad"].isin(teams)]
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def main():
    """Download the team passing table, save it as CSV and print two teams.

    Does nothing when the download step returns ``None``.
    """
    df = pull_premier_league_team_passing()
    if df is None:
        return

    # Save the full table to CSV before narrowing it down.
    df.to_csv("premier_league_team_passing.csv", index=False)
    print("\n💾 Saved to premier_league_team_passing.csv")

    teams = ["Wolves", "Brighton"]
    df_filtered = filter_teams(df, teams)
    print("\n📊 Passing Stats for Wolves & Brighton (Team Level)")
    print("=" * 70)
    print(df_filtered)


if __name__ == "__main__":
    main()
|
.history/fbrefdata_example_20251005092803.py
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from io import StringIO
|
| 4 |
+
from selenium import webdriver
|
| 5 |
+
from selenium.webdriver.chrome.service import Service as ChromeService
|
| 6 |
+
from selenium.webdriver.chrome.options import Options as ChromeOptions
|
| 7 |
+
from webdriver_manager.chrome import ChromeDriverManager
|
| 8 |
+
from selenium.webdriver.common.by import By
|
| 9 |
+
from selenium.webdriver.support.ui import WebDriverWait
|
| 10 |
+
from selenium.webdriver.support import expected_conditions as EC
|
| 11 |
+
from selenium.common.exceptions import TimeoutException
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def pull_premier_league_team_passing():
    """Scrape a Premier League team passing table from FBref.

    Opens the page at ``url`` in Chrome via Selenium, dismisses the cookie
    banner when one shows up, grabs the ``div_stats_passing_team`` wrapper and
    parses the first table inside it with pandas.

    Returns:
        A DataFrame with a ``Squad`` column plus whichever of ``Total_Cmp``,
        ``Total_Att``, ``Total_Cmp%`` and ``Total_TotDist`` were found, or
        ``None`` when the wrapper div does not appear within 20 seconds (a
        screenshot and the page source are saved for debugging in that case).

    Raises:
        KeyError: if the parsed table has no ``Squad`` column.
    """
    # NOTE(review): confirm this page actually contains the
    # "div_stats_passing_team" wrapper that is waited for below.
    url = "https://fbref.com/en/comps/9/teams/Premier-League-Stats"
    print(f"Opening browser to download team passing stats from {url} ...")
    # NOTE(review): fixed 5 s pause before the browser starts - presumably to
    # space out requests; confirm it is still wanted.
    time.sleep(5)

    options = ChromeOptions()
    options.add_argument("--start-maximized")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    # options.add_argument("--headless")  # enable when the browser window need not be visible
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)

    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)
    # try/finally so the browser is closed on every exit path, including
    # exceptions other than TimeoutException.
    try:
        driver.get(url)

        try:
            accept_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable(
                (By.XPATH, "//button[contains(text(), 'Accept All Cookies')]")))
            accept_button.click()
            print("Cookie banner accepted.")
        except TimeoutException:
            print("No cookie banner found or it took too long.")

        # Wait for the wrapper div of the team table to be present.
        try:
            div_element = WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.ID, "div_stats_passing_team")))
        except TimeoutException:
            print("❌ The team stats table could not be found on the page. Saving debug files...")
            driver.save_screenshot('debug_screenshot.png')
            with open('debug_page.html', 'w', encoding='utf-8') as f:
                f.write(driver.page_source)
            return None

        print("✅ Team stats div found, extracting HTML...")
        # Only the HTML of the team passing wrapper is handed to pandas.
        team_html = div_element.get_attribute("outerHTML")
    finally:
        driver.quit()

    print("Data downloaded. Processing with pandas...")
    team_df = pd.read_html(StringIO(team_html))[0]
    print(f"✅ Found team table with shape: {team_df.shape}")
    return _tidy_team_passing_table(team_df)


def _tidy_team_passing_table(team_df):
    """Reduce a raw passing table to ``Squad`` plus uniquely named total columns."""
    # A two-row header comes back as a MultiIndex; flatten it to "Group_Name".
    if isinstance(team_df.columns, pd.MultiIndex):
        team_df.columns = ['_'.join(map(str, col)).strip() for col in team_df.columns.values]

    # Checked in order, so 'Cmp%' is tested before the shorter 'Cmp'.
    patterns = (
        ('Squad', 'Squad'),
        ('Cmp%', 'Total_Cmp%'),
        ('Cmp', 'Total_Cmp'),
        ('Att', 'Total_Att'),
        ('TotDist', 'Total_TotDist'),
    )
    rename_map = {}
    for col in team_df.columns:
        target = next((new for key, new in patterns if key in str(col)), None)
        # Only the first column matching a target is kept; mapping every match
        # onto the same name would produce duplicate column labels.
        # NOTE(review): presumably the first match is the "Total" group the
        # target names refer to - confirm against the page layout.
        if target is not None and target not in rename_map.values():
            rename_map[col] = target

    if 'Squad' not in rename_map.values():
        raise KeyError('Squad')

    # Select and rename in one step so a fresh frame is produced rather than
    # renaming a slice in place.
    team_df = team_df[list(rename_map)].rename(columns=rename_map)

    # Drop rows without a squad and rows whose Squad cell contains "Squad" or
    # "Rk" (presumably header rows repeated inside the table body).
    team_df = team_df[team_df['Squad'].notna()]
    team_df = team_df[~team_df['Squad'].str.contains("Squad|Rk", na=False)]
    return team_df
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
def filter_teams(df, teams):
    """Return the rows of ``df`` whose ``Squad`` value is one of ``teams``.

    Args:
        df: DataFrame with a ``Squad`` column.
        teams: A list-like of squad names, or a single squad name as a string.

    Returns:
        The matching rows of ``df``, in their original order.

    Raises:
        KeyError: if ``df`` has no ``Squad`` column.
    """
    if isinstance(teams, str):
        # ``Series.isin`` rejects a bare string, so wrap a single name.
        teams = [teams]
    return df[df["Squad"].isin(teams)]
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def main():
    """Download the team passing table, save it as CSV and print two teams.

    Does nothing when the download step returns ``None``.
    """
    df = pull_premier_league_team_passing()
    if df is None:
        return

    # Save the full table to CSV before narrowing it down.
    df.to_csv("premier_league_team_passing.csv", index=False)
    print("\n💾 Saved to premier_league_team_passing.csv")

    teams = ["Wolves", "Brighton"]
    df_filtered = filter_teams(df, teams)
    print("\n📊 Passing Stats for Wolves & Brighton (Team Level)")
    print("=" * 70)
    print(df_filtered)


if __name__ == "__main__":
    main()
|
.history/fbrefdata_example_20251005092809.py
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from io import StringIO
|
| 4 |
+
from selenium import webdriver
|
| 5 |
+
from selenium.webdriver.chrome.service import Service as ChromeService
|
| 6 |
+
from selenium.webdriver.chrome.options import Options as ChromeOptions
|
| 7 |
+
from webdriver_manager.chrome import ChromeDriverManager
|
| 8 |
+
from selenium.webdriver.common.by import By
|
| 9 |
+
from selenium.webdriver.support.ui import WebDriverWait
|
| 10 |
+
from selenium.webdriver.support import expected_conditions as EC
|
| 11 |
+
from selenium.common.exceptions import TimeoutException
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def pull_premier_league_team_passing():
    """Scrape the Premier League team passing table from FBref with Selenium.

    Opens Chrome, dismisses the cookie banner when one appears, waits for the
    ``div_stats_passing_team`` wrapper and parses the table inside it with
    pandas.

    Returns:
        A DataFrame with a ``Squad`` column plus whichever of ``Total_Cmp``,
        ``Total_Att``, ``Total_Cmp%`` and ``Total_TotDist`` were found, or
        ``None`` when the wrapper never appears (a screenshot and the page
        source are then written to ``debug_screenshot.png`` and
        ``debug_page.html``).

    Raises:
        KeyError: If the parsed table has no column containing ``Squad``.
    """
    url = "https://fbref.com/en/comps/9/passing/Premier-League-Stats"
    print(f"Opening browser to download team passing stats from {url} ...")
    time.sleep(5)

    options = ChromeOptions()
    options.add_argument("--start-maximized")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    # options.add_argument("--headless")  # enable to run without a visible browser window
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)

    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)
    # try/finally so the browser is closed on every exit path, including
    # unexpected exceptions.
    try:
        driver.get(url)

        try:
            accept_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable(
                (By.XPATH, "//button[contains(text(), 'Accept All Cookies')]")))
            accept_button.click()
            print("Cookie banner accepted.")
        except TimeoutException:
            print("No cookie banner found or it took too long.")

        # Wait for the wrapper div of the team table to appear.
        try:
            div_element = WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.ID, "div_stats_passing_team")))
            print("✅ Team stats div found, extracting HTML...")
            # Take only the HTML of the team passing section.
            team_html = div_element.get_attribute("outerHTML")
        except TimeoutException:
            print("❌ The team stats table could not be found on the page. Saving debug files...")
            driver.save_screenshot('debug_screenshot.png')
            with open('debug_page.html', 'w', encoding='utf-8') as f:
                f.write(driver.page_source)
            return None
    finally:
        driver.quit()

    print("Data downloaded. Processing with pandas...")

    # Read the table from the HTML fragment.
    team_df = pd.read_html(StringIO(team_html))[0]
    print(f"✅ Found team table with shape: {team_df.shape}")

    # Flatten a two-row header into single "<top>_<bottom>" names.
    if isinstance(team_df.columns, pd.MultiIndex):
        team_df.columns = ['_'.join(col).strip() for col in team_df.columns.values]

    # Map each wanted stat to the FIRST column whose name contains its token.
    # Several flattened columns can contain the same token; renaming all of
    # them to one target would produce duplicate column names.
    rename_map = {}
    for c in team_df.columns:
        if 'Squad' in c:
            target = 'Squad'
        elif 'Cmp%' in c:
            target = 'Total_Cmp%'
        elif 'Cmp' in c:
            target = 'Total_Cmp'
        elif 'Att' in c:
            target = 'Total_Att'
        elif 'TotDist' in c:
            target = 'Total_TotDist'
        else:
            continue
        if target not in rename_map.values():
            rename_map[c] = target

    # Non-mutating rename: avoids modifying a slice of the parsed frame.
    team_df = team_df[list(rename_map)].rename(columns=rename_map)

    # Drop empty rows and repeated header rows embedded in the table body.
    team_df = team_df[team_df['Squad'].notna()]
    team_df = team_df[~team_df['Squad'].str.contains("Squad|Rk", na=False)]

    return team_df
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
def filter_teams(df, teams):
    """Return only the rows of *df* whose ``Squad`` value is in *teams*.

    Args:
        df: DataFrame that has a ``Squad`` column.
        teams: List-like of squad names to keep.

    Returns:
        A new DataFrame with the matching rows, in their original order.

    Raises:
        KeyError: If *df* has no ``Squad`` column.
    """
    return df[df["Squad"].isin(teams)]
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def main():
    """Download the team passing table, save it as CSV and print two squads.

    Does nothing when the scraper returns ``None``.
    """
    df = pull_premier_league_team_passing()
    if df is None:
        return

    # Save the full table to CSV automatically.
    df.to_csv("premier_league_team_passing.csv", index=False)
    print("\n💾 Saved to premier_league_team_passing.csv")

    teams = ["Wolves", "Brighton"]
    df_filtered = filter_teams(df, teams)
    print("\nPassing Stats for Wolves & Brighton (Team Level)")
    print("=" * 70)
    print(df_filtered)


if __name__ == "__main__":
    main()
|
.history/fbrefdata_example_20251005092817.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from io import StringIO
|
| 4 |
+
from selenium import webdriver
|
| 5 |
+
from selenium.webdriver.chrome.service import Service as ChromeService
|
| 6 |
+
from selenium.webdriver.chrome.options import Options as ChromeOptions
|
| 7 |
+
from webdriver_manager.chrome import ChromeDriverManager
|
| 8 |
+
from selenium.webdriver.common.by import By
|
| 9 |
+
from selenium.webdriver.support.ui import WebDriverWait
|
| 10 |
+
from selenium.webdriver.support import expected_conditions as EC
|
| 11 |
+
from selenium.common.exceptions import TimeoutException
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def pull_premier_league_team_passing():
    """Scrape the Premier League team passing table from FBref with Selenium.

    Opens Chrome, dismisses the cookie banner when one appears, waits for the
    ``div_stats_passing_team`` wrapper and parses the table inside it with
    pandas.

    Returns:
        A DataFrame with a ``Squad`` column plus whichever of ``Total_Cmp``,
        ``Total_Att``, ``Total_Cmp%`` and ``Total_TotDist`` were found, or
        ``None`` when the wrapper never appears (a screenshot and the page
        source are then written to ``debug_screenshot.png`` and
        ``debug_page.html``).

    Raises:
        KeyError: If the parsed table has no column containing ``Squad``.
    """
    # NOTE(review): confirm this page actually contains the
    # `div_stats_passing_team` wrapper that is waited for below.
    url = "https://fbref.com/en/comps/9/teams/Premier-League-Stats"

    print(f"Opening browser to download team passing stats from {url} ...")
    time.sleep(5)

    options = ChromeOptions()
    options.add_argument("--start-maximized")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    # options.add_argument("--headless")  # enable to run without a visible browser window
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)

    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)
    # try/finally so the browser is closed on every exit path, including
    # unexpected exceptions.
    try:
        driver.get(url)

        try:
            accept_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable(
                (By.XPATH, "//button[contains(text(), 'Accept All Cookies')]")))
            accept_button.click()
            print("Cookie banner accepted.")
        except TimeoutException:
            print("No cookie banner found or it took too long.")

        # Wait for the wrapper div of the team table to appear.
        try:
            div_element = WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.ID, "div_stats_passing_team")))
            print("✅ Team stats div found, extracting HTML...")
            # Take only the HTML of the team passing section.
            team_html = div_element.get_attribute("outerHTML")
        except TimeoutException:
            print("❌ The team stats table could not be found on the page. Saving debug files...")
            driver.save_screenshot('debug_screenshot.png')
            with open('debug_page.html', 'w', encoding='utf-8') as f:
                f.write(driver.page_source)
            return None
    finally:
        driver.quit()

    print("Data downloaded. Processing with pandas...")

    # Read the table from the HTML fragment.
    team_df = pd.read_html(StringIO(team_html))[0]
    print(f"✅ Found team table with shape: {team_df.shape}")

    # Flatten a two-row header into single "<top>_<bottom>" names.
    if isinstance(team_df.columns, pd.MultiIndex):
        team_df.columns = ['_'.join(col).strip() for col in team_df.columns.values]

    # Map each wanted stat to the FIRST column whose name contains its token.
    # Several flattened columns can contain the same token; renaming all of
    # them to one target would produce duplicate column names.
    rename_map = {}
    for c in team_df.columns:
        if 'Squad' in c:
            target = 'Squad'
        elif 'Cmp%' in c:
            target = 'Total_Cmp%'
        elif 'Cmp' in c:
            target = 'Total_Cmp'
        elif 'Att' in c:
            target = 'Total_Att'
        elif 'TotDist' in c:
            target = 'Total_TotDist'
        else:
            continue
        if target not in rename_map.values():
            rename_map[c] = target

    # Non-mutating rename: avoids modifying a slice of the parsed frame.
    team_df = team_df[list(rename_map)].rename(columns=rename_map)

    # Drop empty rows and repeated header rows embedded in the table body.
    team_df = team_df[team_df['Squad'].notna()]
    team_df = team_df[~team_df['Squad'].str.contains("Squad|Rk", na=False)]

    return team_df
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
def filter_teams(df, teams):
    """Return only the rows of *df* whose ``Squad`` value is in *teams*.

    Args:
        df: DataFrame that has a ``Squad`` column.
        teams: List-like of squad names to keep.

    Returns:
        A new DataFrame with the matching rows, in their original order.

    Raises:
        KeyError: If *df* has no ``Squad`` column.
    """
    return df[df["Squad"].isin(teams)]
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def main():
    """Download the team passing table, save it as CSV and print two squads.

    Does nothing when the scraper returns ``None``.
    """
    df = pull_premier_league_team_passing()
    if df is None:
        return

    # Save the full table to CSV automatically.
    df.to_csv("premier_league_team_passing.csv", index=False)
    print("\n💾 Saved to premier_league_team_passing.csv")

    teams = ["Wolves", "Brighton"]
    df_filtered = filter_teams(df, teams)
    print("\nPassing Stats for Wolves & Brighton (Team Level)")
    print("=" * 70)
    print(df_filtered)


if __name__ == "__main__":
    main()
|
.history/fbrefdata_example_20251005092820.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from io import StringIO
|
| 4 |
+
from selenium import webdriver
|
| 5 |
+
from selenium.webdriver.chrome.service import Service as ChromeService
|
| 6 |
+
from selenium.webdriver.chrome.options import Options as ChromeOptions
|
| 7 |
+
from webdriver_manager.chrome import ChromeDriverManager
|
| 8 |
+
from selenium.webdriver.common.by import By
|
| 9 |
+
from selenium.webdriver.support.ui import WebDriverWait
|
| 10 |
+
from selenium.webdriver.support import expected_conditions as EC
|
| 11 |
+
from selenium.common.exceptions import TimeoutException
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def pull_premier_league_team_passing():
    """Scrape the Premier League team passing table from FBref with Selenium.

    Opens Chrome, dismisses the cookie banner when one appears, waits for the
    ``div_stats_passing_team`` wrapper and parses the table inside it with
    pandas.

    Returns:
        A DataFrame with a ``Squad`` column plus whichever of ``Total_Cmp``,
        ``Total_Att``, ``Total_Cmp%`` and ``Total_TotDist`` were found, or
        ``None`` when the wrapper never appears (a screenshot and the page
        source are then written to ``debug_screenshot.png`` and
        ``debug_page.html``).

    Raises:
        KeyError: If the parsed table has no column containing ``Squad``.
    """
    # NOTE(review): confirm this page actually contains the
    # `div_stats_passing_team` wrapper that is waited for below.
    url = "https://fbref.com/en/comps/9/teams/Premier-League-Stats"

    print(f"Opening browser to download team passing stats from {url} ...")
    time.sleep(5)

    options = ChromeOptions()
    options.add_argument("--start-maximized")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    # options.add_argument("--headless")  # enable to run without a visible browser window
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)

    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)
    # try/finally so the browser is closed on every exit path, including
    # unexpected exceptions.
    try:
        driver.get(url)

        try:
            accept_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable(
                (By.XPATH, "//button[contains(text(), 'Accept All Cookies')]")))
            accept_button.click()
            print("Cookie banner accepted.")
        except TimeoutException:
            print("No cookie banner found or it took too long.")

        # Wait for the wrapper div of the team table to appear.
        try:
            div_element = WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.ID, "div_stats_passing_team")))
            print("✅ Team stats div found, extracting HTML...")
            # Take only the HTML of the team passing section.
            team_html = div_element.get_attribute("outerHTML")
        except TimeoutException:
            print("❌ The team stats table could not be found on the page. Saving debug files...")
            driver.save_screenshot('debug_screenshot.png')
            with open('debug_page.html', 'w', encoding='utf-8') as f:
                f.write(driver.page_source)
            return None
    finally:
        driver.quit()

    print("Data downloaded. Processing with pandas...")

    # Read the table from the HTML fragment.
    team_df = pd.read_html(StringIO(team_html))[0]
    print(f"✅ Found team table with shape: {team_df.shape}")

    # Flatten a two-row header into single "<top>_<bottom>" names.
    if isinstance(team_df.columns, pd.MultiIndex):
        team_df.columns = ['_'.join(col).strip() for col in team_df.columns.values]

    # Map each wanted stat to the FIRST column whose name contains its token.
    # Several flattened columns can contain the same token; renaming all of
    # them to one target would produce duplicate column names.
    rename_map = {}
    for c in team_df.columns:
        if 'Squad' in c:
            target = 'Squad'
        elif 'Cmp%' in c:
            target = 'Total_Cmp%'
        elif 'Cmp' in c:
            target = 'Total_Cmp'
        elif 'Att' in c:
            target = 'Total_Att'
        elif 'TotDist' in c:
            target = 'Total_TotDist'
        else:
            continue
        if target not in rename_map.values():
            rename_map[c] = target

    # Non-mutating rename: avoids modifying a slice of the parsed frame.
    team_df = team_df[list(rename_map)].rename(columns=rename_map)

    # Drop empty rows and repeated header rows embedded in the table body.
    team_df = team_df[team_df['Squad'].notna()]
    team_df = team_df[~team_df['Squad'].str.contains("Squad|Rk", na=False)]

    return team_df
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
def filter_teams(df, teams):
    """Return only the rows of *df* whose ``Squad`` value is in *teams*.

    Args:
        df: DataFrame that has a ``Squad`` column.
        teams: List-like of squad names to keep.

    Returns:
        A new DataFrame with the matching rows, in their original order.

    Raises:
        KeyError: If *df* has no ``Squad`` column.
    """
    return df[df["Squad"].isin(teams)]
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def main():
    """Download the team passing table, save it as CSV and print two squads.

    Does nothing when the scraper returns ``None``.
    """
    df = pull_premier_league_team_passing()
    if df is None:
        return

    # Save the full table to CSV automatically.
    df.to_csv("premier_league_team_passing.csv", index=False)
    print("\n💾 Saved to premier_league_team_passing.csv")

    teams = ["Wolves", "Brighton"]
    df_filtered = filter_teams(df, teams)
    print("\nPassing Stats for Wolves & Brighton (Team Level)")
    print("=" * 70)
    print(df_filtered)


if __name__ == "__main__":
    main()
|
.history/fbrefdata_example_20251005092822.py
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from io import StringIO
|
| 4 |
+
from selenium import webdriver
|
| 5 |
+
from selenium.webdriver.chrome.service import Service as ChromeService
|
| 6 |
+
from selenium.webdriver.chrome.options import Options as ChromeOptions
|
| 7 |
+
from webdriver_manager.chrome import ChromeDriverManager
|
| 8 |
+
from selenium.webdriver.common.by import By
|
| 9 |
+
from selenium.webdriver.support.ui import WebDriverWait
|
| 10 |
+
from selenium.webdriver.support import expected_conditions as EC
|
| 11 |
+
from selenium.common.exceptions import TimeoutException
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def pull_premier_league_team_passing():
    """Scrape the Premier League team passing table from FBref with Selenium.

    Opens Chrome, dismisses the cookie banner when one appears, waits for the
    ``div_stats_passing_team`` wrapper and parses the table inside it with
    pandas.

    Returns:
        A DataFrame with a ``Squad`` column plus whichever of ``Total_Cmp``,
        ``Total_Att``, ``Total_Cmp%`` and ``Total_TotDist`` were found, or
        ``None`` when the wrapper never appears (a screenshot and the page
        source are then written to ``debug_screenshot.png`` and
        ``debug_page.html``).

    Raises:
        KeyError: If the parsed table has no column containing ``Squad``.
    """
    # NOTE(review): confirm this page actually contains the
    # `div_stats_passing_team` wrapper that is waited for below.
    url = "https://fbref.com/en/comps/9/teams/Premier-League-Stats"
    print(f"Opening browser to download team passing stats from {url} ...")
    time.sleep(5)

    options = ChromeOptions()
    options.add_argument("--start-maximized")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    # options.add_argument("--headless")  # enable to run without a visible browser window
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)

    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)
    # try/finally so the browser is closed on every exit path, including
    # unexpected exceptions.
    try:
        driver.get(url)

        try:
            accept_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable(
                (By.XPATH, "//button[contains(text(), 'Accept All Cookies')]")))
            accept_button.click()
            print("Cookie banner accepted.")
        except TimeoutException:
            print("No cookie banner found or it took too long.")

        # Wait for the wrapper div of the team table to appear.
        try:
            div_element = WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.ID, "div_stats_passing_team")))
            print("✅ Team stats div found, extracting HTML...")
            # Take only the HTML of the team passing section.
            team_html = div_element.get_attribute("outerHTML")
        except TimeoutException:
            print("❌ The team stats table could not be found on the page. Saving debug files...")
            driver.save_screenshot('debug_screenshot.png')
            with open('debug_page.html', 'w', encoding='utf-8') as f:
                f.write(driver.page_source)
            return None
    finally:
        driver.quit()

    print("Data downloaded. Processing with pandas...")

    # Read the table from the HTML fragment.
    team_df = pd.read_html(StringIO(team_html))[0]
    print(f"✅ Found team table with shape: {team_df.shape}")

    # Flatten a two-row header into single "<top>_<bottom>" names.
    if isinstance(team_df.columns, pd.MultiIndex):
        team_df.columns = ['_'.join(col).strip() for col in team_df.columns.values]

    # Map each wanted stat to the FIRST column whose name contains its token.
    # Several flattened columns can contain the same token; renaming all of
    # them to one target would produce duplicate column names.
    rename_map = {}
    for c in team_df.columns:
        if 'Squad' in c:
            target = 'Squad'
        elif 'Cmp%' in c:
            target = 'Total_Cmp%'
        elif 'Cmp' in c:
            target = 'Total_Cmp'
        elif 'Att' in c:
            target = 'Total_Att'
        elif 'TotDist' in c:
            target = 'Total_TotDist'
        else:
            continue
        if target not in rename_map.values():
            rename_map[c] = target

    # Non-mutating rename: avoids modifying a slice of the parsed frame.
    team_df = team_df[list(rename_map)].rename(columns=rename_map)

    # Drop empty rows and repeated header rows embedded in the table body.
    team_df = team_df[team_df['Squad'].notna()]
    team_df = team_df[~team_df['Squad'].str.contains("Squad|Rk", na=False)]

    return team_df
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
def filter_teams(df, teams):
    """Return only the rows of *df* whose ``Squad`` value is in *teams*.

    Args:
        df: DataFrame that has a ``Squad`` column.
        teams: List-like of squad names to keep.

    Returns:
        A new DataFrame with the matching rows, in their original order.

    Raises:
        KeyError: If *df* has no ``Squad`` column.
    """
    return df[df["Squad"].isin(teams)]
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def main():
    """Download the team passing table, save it as CSV and print two squads.

    Does nothing when the scraper returns ``None``.
    """
    df = pull_premier_league_team_passing()
    if df is None:
        return

    # Save the full table to CSV automatically.
    df.to_csv("premier_league_team_passing.csv", index=False)
    print("\n💾 Saved to premier_league_team_passing.csv")

    teams = ["Wolves", "Brighton"]
    df_filtered = filter_teams(df, teams)
    print("\nPassing Stats for Wolves & Brighton (Team Level)")
    print("=" * 70)
    print(df_filtered)


if __name__ == "__main__":
    main()
|
.history/fbrefdata_example_20251005092904.py
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from io import StringIO
|
| 4 |
+
from selenium import webdriver
|
| 5 |
+
from selenium.webdriver.chrome.service import Service as ChromeService
|
| 6 |
+
from selenium.webdriver.chrome.options import Options as ChromeOptions
|
| 7 |
+
from webdriver_manager.chrome import ChromeDriverManager
|
| 8 |
+
from selenium.webdriver.common.by import By
|
| 9 |
+
from selenium.webdriver.support.ui import WebDriverWait
|
| 10 |
+
from selenium.webdriver.support import expected_conditions as EC
|
| 11 |
+
from selenium.common.exceptions import TimeoutException
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def pull_premier_league_passing():
    """Fetch passing data (team table, else player table) from FBref.

    Opens Chrome through Selenium, dismisses the cookie banner when one
    appears, then waits for the ``all_stats_passing_team`` wrapper. If that
    times out it waits for ``all_stats_passing`` instead. The first table in
    the wrapper that was found is parsed with pandas.

    Returns:
        ``(df, table_type)`` where ``table_type`` is ``"team"`` or
        ``"player"``. When neither wrapper appears, a screenshot and the page
        source are written to ``debug_screenshot.png`` / ``debug_page.html``
        and ``(None, None)`` is returned, so callers can always unpack the
        result into two names.
    """
    # Main URL
    url = "https://fbref.com/en/comps/9/passing/Premier-League-Stats"
    print(f"Opening browser to download passing stats from {url} ...")

    # --- Set up the Chrome browser ---
    options = ChromeOptions()
    options.add_argument("--start-maximized")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    # options.add_argument("--headless")  # enable to run without a visible browser window
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)

    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)
    # try/finally so the browser is closed on every exit path, including
    # unexpected exceptions.
    try:
        driver.get(url)

        # --- Handle the cookie banner (if it shows up) ---
        try:
            accept_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable(
                (By.XPATH, "//button[contains(text(), 'Accept All Cookies')]")
            ))
            accept_button.click()
            print("🍪 Cookie banner accepted.")
        except TimeoutException:
            print("No cookie banner found or it took too long.")

        # --- Try the TEAM table first ---
        wait = WebDriverWait(driver, 15)
        try:
            div_team = wait.until(EC.presence_of_element_located((By.ID, "all_stats_passing_team")))
            print("✅ Team passing table found.")
            table_html = div_team.get_attribute("outerHTML")
            table_type = "team"
        except TimeoutException:
            print("⚠️ Team passing table not found. Trying player table...")

            # --- Fall back to the player table ---
            try:
                div_player = wait.until(EC.presence_of_element_located((By.ID, "all_stats_passing")))
                print("✅ Player passing table found.")
                table_html = div_player.get_attribute("outerHTML")
                table_type = "player"
            except TimeoutException:
                print("❌ No passing table found at all. Saving debug files...")
                driver.save_screenshot('debug_screenshot.png')
                with open('debug_page.html', 'w', encoding='utf-8') as f:
                    f.write(driver.page_source)
                # A pair, not a bare None: the caller unpacks two values.
                return None, None
    finally:
        driver.quit()

    print("Data downloaded. Processing with pandas...")

    # --- Parse the HTML table into a DataFrame ---
    df = pd.read_html(StringIO(table_html))[0]
    print(f"✅ Table found with shape: {df.shape}")

    # Flatten a two-row header (if any) into single "<top>_<bottom>" names.
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = ['_'.join(col).strip() for col in df.columns.values]

    # Map each wanted field to the FIRST column whose name contains its token.
    # Several flattened columns can contain the same token; renaming all of
    # them to one target would produce duplicate column names.
    rename_map = {}
    for c in df.columns:
        if 'Squad' in c:
            target = 'Squad'
        elif 'Player' in c:
            target = 'Player'
        elif 'Cmp%' in c:
            target = 'Total_Cmp%'
        elif 'Cmp' in c:
            target = 'Total_Cmp'
        elif 'Att' in c:
            target = 'Total_Att'
        elif 'TotDist' in c:
            target = 'Total_TotDist'
        else:
            continue
        if target not in rename_map.values():
            rename_map[c] = target

    # Non-mutating rename: avoids modifying a slice of the parsed frame.
    df = df[list(rename_map)].rename(columns=rename_map)

    # Drop empty rows and repeated header rows embedded in the table body.
    if 'Squad' in df.columns:
        df = df[df['Squad'].notna()]
        df = df[~df['Squad'].str.contains("Squad|Rk", na=False)]

    print(f"✅ Cleaned dataframe shape: {df.shape}")
    return df, table_type
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def filter_teams(df, teams):
    """Filter rows by team name.

    Args:
        df: DataFrame to filter.
        teams: List-like of squad names to keep.

    Returns:
        The rows whose ``Squad`` value is in *teams*. When *df* has no
        ``Squad`` column a warning is printed and *df* itself is returned
        unchanged.
    """
    if "Squad" not in df.columns:
        print("⚠️ 'Squad' column not found, skipping team filter.")
        return df
    return df[df["Squad"].isin(teams)]
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
def main():
    """Download the passing table, save it as CSV and preview selected teams.

    The scraper result is checked before unpacking, so a failed scrape
    (a bare ``None`` or a pair whose DataFrame is ``None``) ends the run
    quietly instead of raising ``TypeError``.
    """
    result = pull_premier_league_passing()
    if result is None:
        return
    df, table_type = result
    if df is None:
        return

    # Save the result; the file name records which table was scraped.
    filename = f"premier_league_{table_type}_passing.csv"
    df.to_csv(filename, index=False)
    print(f"\n💾 Saved to {filename}")

    # Example team filter.
    teams = ["Arsenal", "Wolves", "Brighton"]
    df_filtered = filter_teams(df, teams)
    print(f"\nPassing Stats ({table_type.title()} Level) for selected teams")
    print("=" * 80)
    print(df_filtered.head())


if __name__ == "__main__":
    main()
|
.history/fbrefdata_example_20251005093119.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup, Comment

# === 1. Target URL (latest Premier League passing stats) ===
URL = "https://fbref.com/en/comps/9/passing/Premier-League-Stats"

print(f"📡 Mengambil data dari {URL} ...")

# === 2. Download the HTML page ===
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/120.0.0.0 Safari/537.36"
}
# A timeout keeps the script from hanging forever on a stalled connection.
response = requests.get(URL, headers=headers, timeout=30)

if response.status_code != 200:
    raise RuntimeError(f"Gagal mengunduh halaman (status code {response.status_code})")

html = response.text

# === 3. Locate the table, which may be hidden inside an HTML comment ===
soup = BeautifulSoup(html, "html.parser")

# FBref often wraps tables in <!-- ... --> comments, so search those first.
comments = soup.find_all(string=lambda text: isinstance(text, Comment))
passing_table_html = next(
    (c for c in comments if 'table' in c and 'id="stats_passing' in c),
    None,
)

# === 4. Parse the table out of the comment, else fall back to the live DOM ===
if passing_table_html:
    passing_soup = BeautifulSoup(passing_table_html, "html.parser")
    table = passing_soup.find("table")
    if table is None:
        raise RuntimeError("❌ Tidak bisa mem-parse tabel dari komentar HTML.")
else:
    # No commented-out copy: accept a regular <table> with the same id prefix.
    table = soup.find("table", id=re.compile(r"^stats_passing"))
    if table is None:
        raise RuntimeError("❌ Tabel passing tidak ditemukan. Mungkin struktur halaman berubah.")

# === 5. Convert to a DataFrame ===
# Wrap the markup in StringIO: passing literal HTML to read_html is deprecated.
df = pd.read_html(StringIO(str(table)))[0]

# === 6. Clean up the columns ===
# Flatten two-row headers (tuples) into single space-joined names.
df.columns = [' '.join(col).strip() if isinstance(col, tuple) else col for col in df.columns]
df = df.dropna(how='all')  # drop completely empty rows

# === 7. Save to CSV ===
csv_name = "premier_league_passing_2025.csv"
df.to_csv(csv_name, index=False)
print(f"✅ Data berhasil diunduh dan disimpan ke {csv_name}")

# === 8. Show a preview ===
print("\n=== Preview Data ===")
print(df.head(10))
|
.history/fbrefdata_example_20251005093129.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Download the Premier League passing table from FBref and save it as CSV.

FBref ships some tables inside HTML comments, so the page is parsed twice:
once to collect the comments, and once more to parse the comment that holds
the passing table.
"""
import re
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup, Comment

# === 1. Target URL (latest Premier League passing stats) ===
URL = "https://fbref.com/en/comps/9/passing/Premier-League-Stats"

print(f"📡 Mengambil data dari {URL} ...")

# === 2. Fetch the HTML page ===
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/120.0.0.0 Safari/537.36"
}
# Without a timeout a stalled connection would block the script forever.
response = requests.get(URL, headers=headers, timeout=30)

if response.status_code != 200:
    raise Exception(f"Gagal mengunduh halaman (status code {response.status_code})")

html = response.text

# === 3. Handle tables hidden inside HTML comments ===
soup = BeautifulSoup(html, "html.parser")

# FBref often hides tables inside <!-- ... --> comments.
comments = soup.find_all(string=lambda text: isinstance(text, Comment))

# First comment that contains a table whose id starts with "stats_passing".
passing_table_html = next(
    (c for c in comments if 'table' in c and 'id="stats_passing' in c),
    None,
)

if not passing_table_html:
    raise Exception("❌ Tabel passing tidak ditemukan. Mungkin struktur halaman berubah.")

# === 4. Parse the table out of the comment ===
passing_soup = BeautifulSoup(passing_table_html, "html.parser")
table = passing_soup.find("table")

if table is None:
    raise Exception("❌ Tidak bisa mem-parse tabel dari komentar HTML.")

# === 5. Convert to a DataFrame ===
# read_html expects a file-like object; passing literal HTML is deprecated.
df = pd.read_html(StringIO(str(table)))[0]

# === 6. Clean up the columns ===
# Multi-row headers arrive as tuples; join each tuple into one label.
df.columns = [
    ' '.join(col).strip() if isinstance(col, tuple) else col
    for col in df.columns
]
df = df.dropna(how='all')  # drop fully empty rows

# === 7. Save to CSV ===
csv_name = "premier_league_passing_2025.csv"
df.to_csv(csv_name, index=False)
print(f"✅ Data berhasil diunduh dan disimpan ke {csv_name}")

# === 8. Show a preview ===
print("\n=== Preview Data ===")
print(df.head(10))
|
.history/fbrefdata_example_20251005093230.py
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from io import StringIO
|
| 4 |
+
from selenium import webdriver
|
| 5 |
+
from selenium.webdriver.chrome.service import Service as ChromeService
|
| 6 |
+
from selenium.webdriver.chrome.options import Options as ChromeOptions
|
| 7 |
+
from webdriver_manager.chrome import ChromeDriverManager
|
| 8 |
+
from selenium.webdriver.common.by import By
|
| 9 |
+
from selenium.webdriver.support.ui import WebDriverWait
|
| 10 |
+
from selenium.webdriver.support import expected_conditions as EC
|
| 11 |
+
from selenium.common.exceptions import TimeoutException
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def pull_premier_league_passing():
    """Fetch the Premier League passing table from FBref using Chrome.

    The team-level container (element id ``all_stats_passing_team``) is tried
    first. If it does not appear before the wait times out, the player-level
    container (``all_stats_passing``) is used instead.

    Returns:
        tuple: ``(df, table_type)``. ``df`` is the cleaned DataFrame and
        ``table_type`` is ``"team"`` or ``"player"``. If neither table is
        found, debug files are written and ``(None, None)`` is returned so
        that callers can always unpack the result.
    """
    # Main URL
    url = "https://fbref.com/en/comps/9/passing/Premier-League-Stats"
    print(f"π Opening browser to download passing stats from {url} ...")

    # --- Chrome browser setup ---
    options = ChromeOptions()
    options.add_argument("--start-maximized")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    # options.add_argument("--headless")  # enable to run without a visible browser
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)

    driver = webdriver.Chrome(
        service=ChromeService(ChromeDriverManager().install()),
        options=options,
    )
    # try/finally guarantees the browser is closed on every exit path,
    # including unexpected exceptions raised by Selenium.
    try:
        driver.get(url)

        # --- Handle the cookie banner (if it shows up) ---
        try:
            accept_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable(
                    (By.XPATH, "//button[contains(text(), 'Accept All Cookies')]")
                )
            )
            accept_button.click()
            print("🍪 Cookie banner accepted.")
        except TimeoutException:
            print("No cookie banner found or it took too long.")

        # --- Try the TEAM table first, then fall back to the player table ---
        wait = WebDriverWait(driver, 15)
        try:
            container = wait.until(
                EC.presence_of_element_located((By.ID, "all_stats_passing_team"))
            )
            print("✅ Team passing table found.")
            table_type = "team"
        except TimeoutException:
            print("⚠️ Team passing table not found. Trying player table...")
            try:
                container = wait.until(
                    EC.presence_of_element_located((By.ID, "all_stats_passing"))
                )
                print("✅ Player passing table found.")
                table_type = "player"
            except TimeoutException:
                print("❌ No passing table found at all. Saving debug files...")
                driver.save_screenshot('debug_screenshot.png')
                with open('debug_page.html', 'w', encoding='utf-8') as f:
                    f.write(driver.page_source)
                return None, None

        table_html = container.get_attribute("outerHTML")
    finally:
        driver.quit()

    print("π Data downloaded. Processing with pandas...")

    # --- Parse the HTML table into a DataFrame ---
    df = pd.read_html(StringIO(table_html))[0]
    print(f"✅ Table found with shape: {df.shape}")

    # Flatten a two-row header (if present) into single labels.
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = ['_'.join(col).strip() for col in df.columns.values]

    # Keep only the relevant columns (substring match on the label).
    wanted = ('Squad', 'Player', 'Cmp', 'Att', 'Cmp%', 'TotDist')
    cols_to_use = [c for c in df.columns if any(key in c for key in wanted)]

    # Normalise column names. Order matters: 'Cmp%' must be tested before
    # 'Cmp' because the former contains the latter.
    # NOTE(review): matching is by substring, so several source columns may
    # map to the same target name and produce duplicate labels — confirm
    # this is intended.
    rename_map = {}
    for c in cols_to_use:
        if 'Squad' in c:
            rename_map[c] = 'Squad'
        elif 'Player' in c:
            rename_map[c] = 'Player'
        elif 'Cmp%' in c:
            rename_map[c] = 'Total_Cmp%'
        elif 'Cmp' in c:
            rename_map[c] = 'Total_Cmp'
        elif 'Att' in c:
            rename_map[c] = 'Total_Att'
        elif 'TotDist' in c:
            rename_map[c] = 'Total_TotDist'
    # rename() returns a new frame, avoiding an in-place edit of a slice.
    df = df[cols_to_use].rename(columns=rename_map)

    # Drop empty rows and header rows repeated inside the table body.
    if 'Squad' in df.columns:
        df = df[df['Squad'].notna()]
        df = df[~df['Squad'].str.contains("Squad|Rk", na=False)]

    print(f"✅ Cleaned dataframe shape: {df.shape}")
    return df, table_type
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def filter_teams(df, teams):
    """Return the rows of ``df`` whose ``Squad`` value is one of ``teams``.

    Args:
        df: DataFrame to filter.
        teams: Team names to keep. A list-like of names, or a single name
            given as a plain string.

    Returns:
        The filtered DataFrame. If ``df`` has no ``Squad`` column a warning
        is printed and ``df`` is returned unchanged.
    """
    if "Squad" not in df.columns:
        print("⚠️ 'Squad' column not found, skipping team filter.")
        return df
    # Series.isin rejects a bare string, so wrap a single name in a list.
    if isinstance(teams, str):
        teams = [teams]
    return df[df["Squad"].isin(teams)]
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
def main():
    """Download the passing table, save it as CSV and print a team sample."""
    result = pull_premier_league_passing()
    # The scraper signals failure with None (or a None DataFrame); check
    # before unpacking so a failed download does not raise TypeError here.
    if result is None:
        return
    df, table_type = result
    if df is None:
        return

    # Save the result
    filename = f"premier_league_{table_type}_passing.csv"
    df.to_csv(filename, index=False)
    print(f"\n💾 Saved to {filename}")

    # Example team filter
    teams = ["Arsenal", "Wolves", "Brighton"]
    df_filtered = filter_teams(df, teams)
    print(f"\nπ Passing Stats ({table_type.title()} Level) for selected teams")
    print("=" * 80)
    print(df_filtered.head())


if __name__ == "__main__":
    main()
|
.history/historical_data_20251005104339.py
ADDED
|
File without changes
|
.history/historical_data_20251005104343.py
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
from selenium import webdriver
|
| 3 |
+
from selenium.webdriver.chrome.service import Service as ChromeService
|
| 4 |
+
from webdriver_manager.chrome import ChromeDriverManager
|
| 5 |
+
from selenium.webdriver.common.by import By
|
| 6 |
+
from selenium.webdriver.support.ui import WebDriverWait
|
| 7 |
+
from selenium.webdriver.support import expected_conditions as EC
|
| 8 |
+
from io import StringIO
|
| 9 |
+
import time
|
| 10 |
+
import sys
|
| 11 |
+
|
| 12 |
+
# --- FUNGSI UNTUK MENGHITUNG RATA-RATA PASSING % PER TIM ---
|
| 13 |
+
def calculate_team_passing_avg(passing_stats_file):
    """Compute each team's mean pass-completion percentage from a CSV file.

    Args:
        passing_stats_file: Path to a CSV with at least the columns
            ``Squad`` and ``Total_Cmp%``.

    Returns:
        A DataFrame with columns ``Squad`` and ``AvgPass%`` (the mean of
        ``Total_Cmp%`` per squad; values that are not numeric are ignored).
        Returns ``None`` if the file is missing, the required columns are
        absent, or any other error occurs while processing; a message is
        printed in each case.
    """
    try:
        df_pass = pd.read_csv(passing_stats_file)
        if "Squad" not in df_pass.columns or "Total_Cmp%" not in df_pass.columns:
            print(f"❌ Error: Kolom 'Squad' atau 'Total_Cmp%' tidak ditemukan di {passing_stats_file}")
            return None

        # Coerce to numeric (bad values become NaN), then average per team.
        df_pass['Total_Cmp%'] = pd.to_numeric(df_pass['Total_Cmp%'], errors='coerce')
        team_avg_pass = (
            df_pass.groupby('Squad')['Total_Cmp%']
            .mean()
            .reset_index()
            .rename(columns={'Total_Cmp%': 'AvgPass%'})
        )
        print("✅ Berhasil menghitung rata-rata passing % per tim.")
        return team_avg_pass

    except FileNotFoundError:
        print(f"❌ Error: File '{passing_stats_file}' tidak ditemukan.")
        print(" Pastikan file ini ada di folder yang sama.")
        return None
    except Exception as e:
        # Best-effort boundary: report the problem and let the caller decide.
        print(f"❌ Terjadi error saat memproses {passing_stats_file}: {e}")
        return None
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
# --- FUNGSI UTAMA UNTUK SCRAPING DATA PERTANDINGAN ---
|
| 41 |
+
def scrape_historical_matches():
    """Scrape the 2023-2024 Premier League fixtures table from FBref.

    Opens the schedule page in headless Chrome, tries to dismiss the cookie
    banner, and waits for the element with id ``sched_2023-2024_9_1``.

    Returns:
        The ``outerHTML`` of the fixtures table as a string, or ``None`` if
        the table could not be located. The browser is always closed before
        returning.
    """
    # URL for the completed 2023-2024 Premier League season
    url = "https://fbref.com/en/comps/9/schedule/2023-2024/Premier-League-Scores-and-Fixtures"
    print(f"π Mengakses halaman: {url}")

    options = webdriver.ChromeOptions()
    options.add_argument("--headless")  # run in the background without a window
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36")

    driver = None
    try:
        driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)
        driver.get(url)

        # Try to click the cookie banner if there is one. This step is
        # best-effort, but `except Exception` (unlike a bare `except`) still
        # lets KeyboardInterrupt and SystemExit propagate.
        try:
            accept_button = WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable((By.XPATH, '//button[text()="Accept All"]'))
            )
            accept_button.click()
            print("✅ Cookie banner diterima.")
            time.sleep(2)
        except Exception:
            print("ℹ️ Tidak ada cookie banner atau sudah diterima.")

        # Grab the HTML of the fixtures table
        try:
            table_element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.ID, "sched_2023-2024_9_1"))
            )
            html_source = table_element.get_attribute('outerHTML')
            print("✅ Berhasil mengambil tabel data pertandingan.")
            return html_source
        except Exception as e:
            print(f"❌ Gagal menemukan tabel pertandingan: {e}")
            return None

    finally:
        # driver stays None if Chrome failed to start.
        if driver:
            driver.quit()
|
| 85 |
+
|
| 86 |
+
# --- MAIN SCRIPT ---
|
| 87 |
+
if __name__ == "__main__":
    PASSING_STATS_FILE = "premier_league_player_passing.csv"
    OUTPUT_FILE = "historical_matches.csv"

    # 1. Compute the per-team passing average from the existing file.
    team_pass_avg_df = calculate_team_passing_avg(PASSING_STATS_FILE)
    if team_pass_avg_df is None:
        sys.exit(1)  # non-zero status: the run failed

    # 2. Scrape the historical match data.
    html_table = scrape_historical_matches()
    if html_table is None:
        sys.exit(1)

    # 3. Process the scraped data.
    print("⚙️ Memproses data pertandingan...")
    df_matches = pd.read_html(StringIO(html_table))[0]

    # Keep played matches only: a score must exist and contain the separator.
    # NOTE(review): the separator is assumed to be an en dash ("–"), as used
    # in FBref score cells — confirm against the live table.
    df_matches = df_matches[['Date', 'Home', 'Score', 'Away']].dropna(subset=['Score'])
    # .copy() makes an independent frame so the column assignments below do
    # not operate on a slice of the parsed table.
    df_matches = df_matches[df_matches['Score'].str.contains('–', na=False)].copy()

    scores = df_matches['Score'].str.split('–', expand=True)
    df_matches['HomeGoals'] = pd.to_numeric(scores[0])
    df_matches['AwayGoals'] = pd.to_numeric(scores[1])

    print("π Menggabungkan data pertandingan dengan data passing...")

    # Map team name -> average passing %.
    pass_map = dict(zip(team_pass_avg_df['Squad'], team_pass_avg_df['AvgPass%']))
    # Fallback for unmatched teams; computed once instead of once per row.
    league_avg_pass = team_pass_avg_df['AvgPass%'].mean()

    def get_pass_perc(team_name):
        """Return the passing % for a team, falling back to the overall mean.

        Tries an exact name match first, then a substring match in either
        direction to tolerate differing name spellings between the two files.
        """
        if not isinstance(team_name, str):
            # Missing names (e.g. NaN) cannot be substring-matched.
            return league_avg_pass
        if team_name in pass_map:
            return pass_map[team_name]
        for squad_name, perc in pass_map.items():
            if team_name in squad_name or squad_name in team_name:
                return perc
        return league_avg_pass

    df_matches['HomePass%'] = df_matches['Home'].apply(get_pass_perc)
    df_matches['AwayPass%'] = df_matches['Away'].apply(get_pass_perc)

    # Final column selection and rounding.
    final_df = df_matches[
        ['Date', 'Home', 'Away', 'HomeGoals', 'AwayGoals', 'HomePass%', 'AwayPass%']
    ].round(1)

    # 4. Save to CSV.
    try:
        final_df.to_csv(OUTPUT_FILE, index=False)
        print(f"\nπ SUKSES! File '{OUTPUT_FILE}' berhasil dibuat dengan {len(final_df)} data pertandingan.")
        print(" Sekarang Anda bisa menjalankan script prediksi utama Anda.")
    except Exception as e:
        print(f"❌ Gagal menyimpan file CSV: {e}")
|