# Import all required libraries
import random

import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns
import statsmodels.api as sm
from scipy.stats import gamma, kurtosis, norm, skewnorm

# Load the births14 dataset from data/births14.csv
births14 = pd.read_csv("data/births14.csv")

# Seed Python's built-in random module so results are reproducible
random.seed(123)
Exploratory Data Analysis
What is exploratory data analysis?
Exploratory Data Analysis is a statistical approach to analyzing datasets to summarize their main characteristics, often using visual methods.
# Remove IQR outliers from births14, considering every numerical column.
#
# A row is kept only if, for each numerical column, its value lies inside
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]. The per-column masks are combined into a
# single mask; reassigning births14_clean from births14 inside the loop would
# keep only the filter of the last column processed.

# Select numerical columns
numerical_cols = births14.select_dtypes(include=['number']).columns

# Start by keeping every row; each column can only narrow this mask
keep_rows = pd.Series(True, index=births14.index)

for col in numerical_cols:
    # Find Q1, Q3, and interquartile range (IQR) for each column
    Q1 = births14[col].quantile(0.25)
    Q3 = births14[col].quantile(0.75)
    IQR = Q3 - Q1

    # Upper and lower bounds for each column
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # between() is inclusive on both ends, matching >= lower and <= upper.
    # A missing value is not an outlier, so rows with NaN in this column
    # are kept here and left for the missing-value handling step.
    in_range = births14[col].between(lower_bound, upper_bound)
    keep_rows &= in_range | births14[col].isna()

# Filter out the outliers from the DataFrame
births14_clean = births14[keep_rows]
Why are there still outliers?
Missing values (NaN)
# Number of missing (NaN) entries per column of births14
births14.isna().sum()
# Print the distinct values of every object/category column of births14,
# one line per column, to spot unexpected or inconsistent labels.
categorical_cols = births14.select_dtypes(include=['object', 'category']).columns

for column in categorical_cols:
    print(f"{column}: {births14[column].unique()}")
Normality check
Checking if the data follows a normal distribution is a common step in EDA.
Normality check
Histogram: bell-shaped curve
Skewness: close to 0 for symmetry; Kurtosis: close to 3 for normal “tailedness” (equivalently, excess kurtosis close to 0, which is what scipy.stats.kurtosis reports by default).
Sample Size: Larger samples are less sensitive to non-normality.
Empirical Rule: the 68-95-99.7 rule (about 68%, 95%, and 99.7% of values fall within 1, 2, and 3 standard deviations of the mean).
# Q-Q plot of newborn weight against a normal distribution.

# Make a copy of the data so births14 itself is left untouched
dataCopy = births14.copy()

# Remove NAs
# NOTE(review): dropna() with default arguments drops a row if ANY column is
# missing, not only `weight`; confirm this is intended, or restrict it with
# dropna(subset=["weight"]).
dataCopyFin = dataCopy.dropna()

# Q-Q plot; line='s' adds the standardized reference line
sm.qqplot(dataCopyFin.weight, line='s')
plt.title('Newborn Weight Q-Q plot')
plt.show()
Negative-skew (left-tailed)
Conclusions
Always inspect your data first.
Visualize relationships and distributions.
Identify and handle outliers and missing values.
Check for normality and understand the distribution of your data.