# clone the project repository, change to right directory, and import libraries.
%cd /content
!git clone https://github.com/EllaMoses/EllaMoses.github.io.git
%cd /content/EllaMoses.github.io/data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

/content
Cloning into 'EllaMoses.github.io'...
remote: Enumerating objects: 54, done.
remote: Counting objects: 100% (54/54), done.
remote: Compressing objects: 100% (47/47), done.
remote: Total 54 (delta 19), reused 3 (delta 1), pack-reused 0
Receiving objects: 100% (54/54), 2.06 MiB | 2.51 MiB/s, done.
Resolving deltas: 100% (19/19), done.
/content/EllaMoses.github.io/data


#Read in Play List file
plays = pd.read_csv("PlayList.csv")
plays


# data types of variables in plays dataframe
plays.dtypes

PlayerKey          int64
GameID            object
PlayKey           object
RosterPosition    object
PlayerDay          int64
PlayerGame         int64
StadiumType       object
FieldType         object
Temperature        int64
Weather           object
PlayType          object
PlayerGamePlay     int64
Position          object
PositionGroup     object
dtype: object


# summary statistics for plays dataframe
plays.describe()


#Read in injury file
injury = pd.read_csv("InjuryRecord.csv")
injury


# data types of variables in injury dataframe
injury.dtypes

PlayerKey     int64
GameID       object
PlayKey      object
BodyPart     object
Surface      object
DM_M1         int64
DM_M7         int64
DM_M28        int64
DM_M42        int64
dtype: object


# summary statistics for injury data frame
injury.describe()


#Combine the two dataframes by performing a right merge to add all relevent info from plays data frame to the injury data frame

injury_plays = injury.merge(plays, on=["PlayerKey", "GameID", "PlayKey"], how="left")

#we can use .replace to fix all -999 values with NaN
injury_plays['Temperature'] = injury_plays['Temperature'].replace(-999,np.nan)

# map weather conditions to account for typos and other name variations
weathermapped = injury_plays['Weather'].map({
  "Clear": "Clear",
  "Clear Skies": "Clear",
  "Clear and warm":"Clear",
  "Clear skies":"Clear",
  "Cloudy":"Cloudy",
  "Cloudy and Cool":"Cloudy",
  "Cloudy with periods of rain, thunder possible. Winds shifting to WNW, 10-20 mph.":"Rain",
  "Cloudy, 50% chnace of rain":"Rain",
  "Cold":"NaN",
  "Controlled Climate":"Indoor",
  "Coudy":"Cloudy",
  "Fair":"Partly Cloudy",
  "Indoor":"Indoor",
  "Indoors":"Indoor",
  "Light Rain":"Rain",
  "Mostly Sunny":"Partly Cloudy",
  "Mostly cloudy":"Partly Cloudy",
  "Mostly sunny":"Partly Cloudy",
  "Partly Cloudy":"Partly Cloudy",
  "Rain":"Rain",
  "Rain shower":"Rain",
  "Sun & clouds":"Partly Cloudy",
  "Sunny":"Clear",
})
injury_plays['Weather'] = weathermapped

# Create a new variable to store the number of days missed
def num_days_missed(row):
    for i in [42, 28, 7, 1]:
        if row[f'DM_M{i}'] == 1:
            return i
    return 0

# Apply the function to each row to get the total days missed
injury_plays['total_days_missed'] = injury_plays.apply(num_days_missed, axis=1)

injury_plays


bodypartgroup = injury.groupby('BodyPart').count()
bodypartgroup['PlayerKey'].plot.bar()

<Axes: xlabel='BodyPart'>


injury_plays['total_days_missed'].value_counts().sort_index().plot.bar()

<Axes: xlabel='total_days_missed'>


pd.crosstab(index=injury_plays["total_days_missed"],
                        columns=injury_plays["BodyPart"],normalize=True)


sns.heatmap( pd.crosstab(injury_plays['total_days_missed'], injury_plays['BodyPart'], normalize=True), cmap='Purples')

<Axes: xlabel='BodyPart', ylabel='total_days_missed'>


pd.crosstab(index=injury_plays["Weather"],
                        columns=injury_plays["BodyPart"])


sns.heatmap( pd.crosstab(injury_plays['BodyPart'], injury_plays['Weather'], normalize=True), cmap='Purples')

<Axes: xlabel='Weather', ylabel='BodyPart'>


pd.crosstab(index=injury_plays["Weather"],
                        columns=injury_plays["total_days_missed"])


sns.heatmap( pd.crosstab(injury_plays['total_days_missed'], injury_plays['Weather'], normalize=True), cmap='Purples')

<Axes: xlabel='Weather', ylabel='total_days_missed'>


pd.crosstab(index=injury_plays["Surface"],
                        columns=injury_plays["BodyPart"],normalize=True)


sns.heatmap( pd.crosstab(injury_plays['BodyPart'], injury_plays['Surface'],normalize=True), cmap='Purples')

<Axes: xlabel='Surface', ylabel='BodyPart'>


pd.crosstab(index=injury_plays["Surface"],
                        columns=injury_plays["total_days_missed"])


sns.heatmap( pd.crosstab(injury_plays['total_days_missed'], injury_plays['Surface'], normalize=True), cmap='Purples')

<Axes: xlabel='Surface', ylabel='total_days_missed'>


pd.crosstab(index=injury_plays["PlayType"],
                        columns=injury_plays["BodyPart"], normalize=True)


sns.heatmap( pd.crosstab(injury_plays['BodyPart'], injury_plays['PlayType'], normalize=True), cmap='Purples')

<Axes: xlabel='PlayType', ylabel='BodyPart'>


pd.crosstab(index=injury_plays["PlayType"],
                        columns=injury_plays["total_days_missed"])


sns.heatmap( pd.crosstab(injury_plays['total_days_missed'], injury_plays['PlayType'], normalize=True), cmap='Purples')

<Axes: xlabel='PlayType', ylabel='total_days_missed'>


pd.crosstab(index=injury_plays["Position"],
                        columns=injury_plays["BodyPart"],normalize=True)


sns.heatmap( pd.crosstab(injury_plays['BodyPart'], injury_plays['Position'],normalize=True), cmap='Purples')

<Axes: xlabel='Position', ylabel='BodyPart'>


pd.crosstab(index=injury_plays["Position"],
                        columns=injury_plays["total_days_missed"],normalize=True)


sns.heatmap( pd.crosstab(injury_plays['total_days_missed'], injury_plays['Position'],normalize=True), cmap='Purples')

<Axes: xlabel='Position', ylabel='total_days_missed'>


import warnings
warnings.filterwarnings('ignore')
injury_plays = injury_plays.dropna()
injury_plays["string_days_missed"] = injury_plays["total_days_missed"].astype(str)
injury_plays


# KNN to predict body part
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, recall_score, precision_score
from sklearn.pipeline import Pipeline

# define the training data
X_train = injury_plays[["Weather", "Position", "PlayType", "FieldType"]].to_dict(orient="records")
y_train = injury_plays["BodyPart"]

#need to do dummy data here
vec = DictVectorizer(sparse=False)
vec.fit(X_train)
X_train = vec.transform(X_train)


# standardize the data
scaler = StandardScaler()
scaler.fit(X_train)
X_train_sc = scaler.transform(X_train)

# fit the k-nearest neighbors model
model = KNeighborsClassifier(n_neighbors=7)
model.fit(X_train_sc, y_train)

pipeline = Pipeline([
    ("scaler", scaler),
    ("model", model)
])
cross_val_score(pipeline, X_train, y_train,
                cv=3, scoring="accuracy").mean()

0.5396825396825397


# logistic regression to predict body part
from sklearn.linear_model import LogisticRegression

# define the training data
X_train = injury_plays[["Weather", "Position", "PlayType", "FieldType"]].to_dict(orient="records")
y_train = injury_plays["BodyPart"]

#need to do dummy data here
vec = DictVectorizer(sparse=False)
vec.fit(X_train)
X_train = vec.transform(X_train)


# standardize the data
scaler = StandardScaler()
scaler.fit(X_train)
X_train_sc = scaler.transform(X_train)

# fit a logistic regression model
model = LogisticRegression()
model.fit(X_train_sc, y_train)
pipeline = Pipeline([
    ("scaler", scaler),
    ("model", model)
])
cross_val_score(pipeline, X_train, y_train,
                cv=3, scoring="accuracy").mean()

0.5396825396825397


# KNN for days missed
# define the training data
X_train = injury_plays[["Weather", "Position", "PlayType", "FieldType"]].to_dict(orient="records")
y_train = injury_plays["string_days_missed"]

#need to do dummy data here
vec = DictVectorizer(sparse=False)
vec.fit(X_train)
X_train = vec.transform(X_train)


# standardize the data
scaler = StandardScaler()
scaler.fit(X_train)
X_train_sc = scaler.transform(X_train)

# fit the k-nearest neighbors model, using classifier because string_days_missed is an object
model = KNeighborsClassifier(n_neighbors=10)
model.fit(X_train_sc, y_train)
pipeline = Pipeline([
    ("scaler", scaler),
    ("model", model)
])
cross_val_score(pipeline, X_train, y_train,
                cv=5, scoring="accuracy").mean()

0.43076923076923085


# logistic regression for days missed

# define the training data
X_train = injury_plays[["Weather", "Position", "PlayType", "FieldType"]].to_dict(orient="records")
y_train = injury_plays["string_days_missed"]

#need to do dummy data here
vec = DictVectorizer(sparse=False)
vec.fit(X_train)
X_train = vec.transform(X_train)


# standardize the data
scaler = StandardScaler()
scaler.fit(X_train)
X_train_sc = scaler.transform(X_train)

# fit a logistic model
model = LogisticRegression()
model.fit(X_train_sc, y_train)
pipeline = Pipeline([
    ("scaler", scaler),
    ("model", model)
])
cross_val_score(pipeline, X_train, y_train,
                cv=8, scoring="accuracy").mean()

0.4419642857142857


# convert notebook to html file, must first download this notebook and add it to the content folder
%%shell
jupyter nbconvert --to html /content/KailenElla.ipynb

	PlayerKey	GameID	PlayKey	RosterPosition	PlayerDay	PlayerGame	StadiumType	FieldType	Temperature	Weather	PlayType	PlayerGamePlay	Position	PositionGroup
0	26624	26624-1	26624-1-1	Quarterback	1	1	Outdoor	Synthetic	63	Clear and warm	Pass	1	QB	QB
1	26624	26624-1	26624-1-2	Quarterback	1	1	Outdoor	Synthetic	63	Clear and warm	Pass	2	QB	QB
2	26624	26624-1	26624-1-3	Quarterback	1	1	Outdoor	Synthetic	63	Clear and warm	Rush	3	QB	QB
3	26624	26624-1	26624-1-4	Quarterback	1	1	Outdoor	Synthetic	63	Clear and warm	Rush	4	QB	QB
4	26624	26624-1	26624-1-5	Quarterback	1	1	Outdoor	Synthetic	63	Clear and warm	Pass	5	QB	QB
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
267000	47888	47888-13	47888-13-51	Cornerback	99	13	Outdoor	Synthetic	33	Sunny and cold	Pass	51	DB	DB
267001	47888	47888-13	47888-13-52	Cornerback	99	13	Outdoor	Synthetic	33	Sunny and cold	Pass	52	DB	DB
267002	47888	47888-13	47888-13-53	Cornerback	99	13	Outdoor	Synthetic	33	Sunny and cold	Pass	53	DB	DB
267003	47888	47888-13	47888-13-54	Cornerback	99	13	Outdoor	Synthetic	33	Sunny and cold	Pass	54	DB	DB
267004	47888	47888-13	47888-13-55	Cornerback	99	13	Outdoor	Synthetic	33	Sunny and cold	Rush	55	DB	DB

	PlayerKey	PlayerDay	PlayerGame	Temperature	PlayerGamePlay
count	267005.000000	267005.000000	267005.000000	267005.000000	267005.000000
mean	41515.381465	210.451351	13.799131	-35.029535	29.058647
std	4125.858924	183.643654	8.342894	304.583110	19.626551
min	26624.000000	-62.000000	1.000000	-999.000000	1.000000
25%	39653.000000	43.000000	7.000000	44.000000	13.000000
50%	42432.000000	102.000000	13.000000	61.000000	26.000000
75%	44480.000000	400.000000	20.000000	72.000000	43.000000
max	47888.000000	480.000000	32.000000	97.000000	102.000000

	PlayerKey	GameID	PlayKey	BodyPart	Surface	DM_M1	DM_M7	DM_M28	DM_M42
0	39873	39873-4	39873-4-32	Knee	Synthetic	1	1	1	1
1	46074	46074-7	46074-7-26	Knee	Natural	1	1	0	0
2	36557	36557-1	36557-1-70	Ankle	Synthetic	1	1	1	1
3	46646	46646-3	46646-3-30	Ankle	Natural	1	0	0	0
4	43532	43532-5	43532-5-69	Ankle	Synthetic	1	1	1	1
...	...	...	...	...	...	...	...	...	...
100	44423	44423-13	NaN	Knee	Synthetic	1	0	0	0
101	31933	31933-20	NaN	Knee	Synthetic	1	0	0	0
102	47285	47285-4	NaN	Knee	Natural	1	1	0	0
103	37068	37068-19	NaN	Knee	Natural	1	1	0	0
104	36696	36696-24	NaN	Knee	Synthetic	1	1	0	0

	PlayerKey	DM_M1	DM_M7	DM_M28	DM_M42
count	105.000000	105.0	105.000000	105.000000	105.000000
mean	42283.609524	1.0	0.723810	0.352381	0.276190
std	4163.510366	0.0	0.449257	0.480003	0.449257
min	31070.000000	1.0	0.000000	0.000000	0.000000
25%	39656.000000	1.0	0.000000	0.000000	0.000000
50%	43518.000000	1.0	1.000000	0.000000	0.000000
75%	45966.000000	1.0	1.000000	1.000000	1.000000
max	47813.000000	1.0	1.000000	1.000000	1.000000

BodyPart	Ankle	Foot	Heel	Knee	Toes
total_days_missed
1	0.152381	0.000000	0.000000	0.104762	0.019048
7	0.123810	0.000000	0.009524	0.200000	0.038095
28	0.019048	0.019048	0.000000	0.028571	0.009524
42	0.104762	0.047619	0.000000	0.123810	0.000000

Project Description¶

Project Goals¶

Discussion:¶

Data Processing¶

Exploratory Analysis & Data Visualization¶

Model¶

Conclusions¶

BodyPart	Ankle	Foot	Knee
Weather
Clear	13	0	9
Cloudy	6	2	7
Indoor	4	0	5
NaN	1	1	1
Partly Cloudy	8	1	7
Rain	2	0	5

BodyPart	Ankle	Foot	Heel	Knee	Toes
Surface
Natural	0.161905	0.047619	0.009524	0.228571	0.009524
Synthetic	0.238095	0.019048	0.000000	0.228571	0.057143

BodyPart	Ankle	Foot	Knee
PlayType
Kickoff	0.012987	0.012987	0.064935
Kickoff Not Returned	0.000000	0.000000	0.012987
Kickoff Returned	0.000000	0.000000	0.012987
Pass	0.194805	0.038961	0.181818
Punt	0.038961	0.000000	0.077922
Punt Not Returned	0.000000	0.012987	0.000000
Punt Returned	0.025974	0.000000	0.012987
Rush	0.181818	0.012987	0.103896