import pandas as pd
import plotly
import cufflinks as cf
# Switch cufflinks to offline mode before any .iplot() call below.
cf.go_offline()

# Load the dataset; every later step works off this frame.
df = pd.read_csv('MFG10YearTerminationData.csv')

# Quick look at the data. These two expressions are not assigned or printed,
# so their results are only visible when run interactively (e.g. a notebook).
df.describe().drop(columns=['EmployeeID', 'store_name', 'STATUS_YEAR'])
df.head(10)

# Rows whose STATUS is 'ACTIVE'; reused for both ratio plots below.
active_rows = df[df.STATUS == 'ACTIVE']
groupby_dept = active_rows.groupby('department_name')

# Per department: number of ACTIVE rows divided by the number of all rows,
# plotted in ascending order.
dept_active_counts = groupby_dept['STATUS'].count()
dept_total_counts = df.groupby('department_name')['STATUS'].count()
dept_active_share = dept_active_counts / dept_total_counts
dept_active_share.sort_values().iplot(kind='bar', barmode='group')

# Distribution of the 'age' column.
df['age'].iplot(kind='histogram')

# Same ACTIVE / total ratio as above, this time grouped by age.
age_active_counts = active_rows.groupby('age')['STATUS'].count()
age_total_counts = df.groupby('age')['STATUS'].count()
age_active_share = age_active_counts / age_total_counts
age_active_share.sort_values().iplot(kind='bar')
# Columns removed before encoding: the employee identifier, the long-form
# gender column, and the raw date keys.
# NOTE(review): presumably 'gender_full' duplicates 'gender_short' — confirm.
columns_to_drop = [
    'EmployeeID',
    'gender_full',
    'terminationdate_key',
    'birthdate_key',
    'orighiredate_key',
    'recorddate_key',
]
df_drop_columns = df.drop(columns=columns_to_drop)

# Columns to one-hot encode. 'STATUS' is included, so the encoded frame
# carries the status as dummy column(s) rather than as the original text.
cat_feats = [
    'city_name',
    'department_name',
    'job_title',
    'store_name',
    'gender_short',
    'termreason_desc',
    'termtype_desc',
    'STATUS_YEAR',
    'STATUS',
    'BUSINESS_UNIT',
]

# drop_first=True omits the first level of each encoded column.
final_data = pd.get_dummies(
    df_drop_columns,
    columns=cat_feats,
    drop_first=True,
)
final_data.head()
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
# Fit a decision tree that predicts the 'STATUS_TERMINATED' dummy column from
# the remaining encoded features, then predict on a held-out 30% split.
target_col = 'STATUS_TERMINATED'

# The termination reason/type dummies describe the termination itself, so they
# are only known once the outcome is known. Leaving them in X leaks the target
# into the features and makes the evaluation meaningless.
# NOTE(review): inferred from the column names — confirm against the data.
leak_prefixes = ('termreason_desc_', 'termtype_desc_')
leak_cols = [
    col for col in final_data.columns
    if str(col).startswith(leak_prefixes)
]

X = final_data.drop(columns=[target_col] + leak_cols)
y = final_data[target_col]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=101
)

# Seed the tree as well as the split so repeated runs give the same model;
# the same seed value as the split is reused.
dtree = DecisionTreeClassifier(random_state=101)
dtree.fit(X_train, y_train)
predictions = dtree.predict(X_test)
from sklearn.metrics import classification_report
# Print the classification report comparing the held-out labels with the
# tree's predictions.
report_text = classification_report(y_test, predictions)
print(report_text)