Module 6: Practice Sheet#
# Setup code; make sure to run this if using Binder or Colab
import sys
import os
# Prepend the absolute path of ../shared (relative to the current working
# directory) to sys.path so that modules in that directory can be imported.
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..', 'shared')))
# NOTE(review): setup_code is presumably resolved via the path added above -- confirm.
import setup_code
# Module-level alias for the dataset exposed by setup_code. The exercises below
# pass it to functions expecting a DataFrame (presumably pandas -- confirm in setup_code).
stroke_data = setup_code.stroke_data
Part 1: Function Basics#
1.1 Define a Basic Function#
Write a function called describe_data()
that takes a DataFrame and returns its .describe()
output. Don't forget to add a docstring to the function.
# Same assignment as in the setup code above; rebinds the module-level dataset alias.
stroke_data = setup_code.stroke_data
def describe_data(df):
    # Exercise stub: replace `pass` with code that returns df.describe(),
    # and add a docstring describing the function.
    # YOUR CODE HERE
    pass
Solution
def describe_data(df):
    """
    Return a descriptive-statistics summary of the DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        The dataset to describe.

    Returns
    -------
    pandas.DataFrame
        The result of ``df.describe()``; for numeric columns this includes
        count, mean, std, min, quartiles, and max.
    """
    return df.describe()
1.2 Add a Pass Statement#
Define a function called analyze_smoking_effect
that includes a pass statement. We will fill in the function body later.
# YOUR CODE HERE
Solution
def analyze_smoking_effect(df):
    """
    Analyze the effect of smoking on stroke risk in the given DataFrame.

    This is a placeholder: the body is intentionally empty and the function
    currently does nothing with ``df``.

    Parameters:
        df (DataFrame): The DataFrame containing healthcare data.

    Returns:
        None
    """
    pass  # Placeholder for future implementation
Part 2: Return vs Print vs No Return#
2.1 Return Value#
Write a function that returns the average stroke rate for each smoking status category in the dataset.
# YOUR CODE HERE
Solution
def analyze_smoking_effect(df):
    """
    Compute the average stroke rate for each smoking status category.

    Parameters:
        df (DataFrame): The DataFrame containing healthcare data. Must include
            'smoking_status' and 'stroke' columns.

    Returns:
        pandas.Series: A Series with smoking status as the index and the mean
            of the 'stroke' column for that group as the values.
    """
    # Mean of 'stroke' within each smoking category (groupby sorts the
    # categories, so the index order is deterministic).
    smoking_rates = df.groupby('smoking_status')['stroke'].mean()
    return smoking_rates
2.2 Print Statement#
Modify the analyze_smoking_effect(df)
function so that it prints a summary of smoking status counts and the stroke rate for each smoking group.
# YOUR CODE HERE
Hints
The variable smoking_rates
is a Pandas Series, where:
- Each index is a smoking category (like
'smokes'
,'never smoked'
). - Each value is the average stroke rate for that category.
To print a nice summary:
- Loop through
smoking_rates.items()
to access both the category and the stroke rate. - Use
print(f"{category}: {rate:.2%}")
to format the output as percentages.
Solution
def analyze_smoking_effect(df):
    """
    Print smoking status counts and the average stroke rate per smoking group.

    Parameters:
        df (DataFrame): The DataFrame containing healthcare data. Must include
            'smoking_status' and 'stroke' columns.

    Returns:
        None. All results are written to standard output.
    """
    counts = df['smoking_status'].value_counts()
    total = len(df)

    print("Smoking Status Counts:")
    print(counts)
    print(f"\nTotal records: {total}")

    # Stroke rate per smoking category
    smoking_rates = df.groupby('smoking_status')['stroke'].mean()

    print("\nAverage stroke rate by smoking status:")
    for status, rate in smoking_rates.items():
        # ".2%" renders the fraction as a percentage with two decimals
        print(f"{status}: {rate:.2%}")
Part 3: Parameters vs Arguments#
3.1 Multiple Parameters#
Write a function that takes two parameters: a DataFrame and a column name. It should print the column's value counts. You can optionally include a conditional check that verifies the column exists in the DataFrame before printing its value counts.
# YOUR CODE HERE
Solution
def col_description(df, col_name):
    """
    Print the value counts of a specified column in the DataFrame.

    If the column is not present, a message saying so is printed instead.

    Parameters:
        df (DataFrame): The DataFrame containing the data.
        col_name (str): The name of the column to analyze.

    Returns:
        None. The result is written to standard output.
    """
    if col_name in df.columns:
        print(df[col_name].value_counts())
    else:
        print(f"Column '{col_name}' does not exist in the DataFrame.")
3.2 Default Parameters#
Write a function called flag_high_risk()
that filters the dataset for patients with stroke and other risk conditions, such as high glucose, hypertension, and optionally, smoking status. All filters should be optional using default parameters.
Hints
Set a default parameter like
threshold=140
Use a filter condition with:
df['avg_glucose_level'] > threshold
and
df['stroke'] == 1
Combine them with
&
(and wrap each condition in parentheses). Use
return
to send back the filtered DataFrame
Solution
def flag_high_risk(df, glucose_threshold=140, check_hypertension=True, smoking_status=None):
    """
    Return the stroke patients that satisfy every enabled risk filter.

    A row is kept only if it has ``stroke == 1`` AND an average glucose level
    strictly above ``glucose_threshold`` AND passes each optional filter that
    is switched on (the filters are combined with logical AND, not OR).

    Parameters:
        df (DataFrame): The stroke dataset. Must include the 'stroke' and
            'avg_glucose_level' columns, plus 'hypertension' and
            'smoking_status' when the corresponding filter is used.
        glucose_threshold (float): Rows must have 'avg_glucose_level' greater
            than this value. Default is 140.
        check_hypertension (bool): If True, also require ``hypertension == 1``.
            Default is True.
        smoking_status (str or None): If provided, keep only rows whose
            'smoking_status' equals this value,
            e.g., 'smokes', 'formerly smoked', 'never smoked'.

    Returns:
        DataFrame: Filtered DataFrame of high-risk stroke patients.
    """
    # Mandatory part of the filter: had a stroke and glucose above the threshold
    condition = (df['stroke'] == 1) & (df['avg_glucose_level'] > glucose_threshold)

    if check_hypertension:
        condition = condition & (df['hypertension'] == 1)

    if smoking_status is not None:
        condition = condition & (df['smoking_status'] == smoking_status)

    return df[condition]
## Default usage
# flag_high_risk(stroke_data).head()
## Include only smokers
# flag_high_risk(stroke_data, smoking_status='smokes').head()
## Use a higher glucose threshold and ignore hypertension
# flag_high_risk(stroke_data, glucose_threshold=160, check_hypertension=False).head()
3.3 Positional and keyword arguments#
Define a function called compare_histograms(df, *columns, **plot_options)
that allows users to plot histograms for multiple numeric columns in the dataset and customize the plot style with keyword arguments.
def compare_histograms(df, *columns, **plot_options):
    # Exercise stub: `columns` collects the positional column names and
    # `plot_options` collects the keyword arguments for styling the plot.
    # YOUR CODE HERE
    pass
Hints
Hint 1: Loop through the column names
Use a for
loop to iterate over columns
, which is a tuple of all the column names passed positionally.
for col in columns:
...
Hint 2: Access optional arguments from **plot_options
Use .get()
to safely access values from the dictionary of keyword arguments. Set defaults like:
plot_options.get("bins", 30)
plot_options.get("alpha", 0.5)
This lets users override defaults, but still works even if they don't.
Hint 3: Customize your plot
Use the values from **plot_options
to set the title, figure size, etc. Examples:
plt.figure(figsize=plot_options.get("figsize", (6, 4)))
plt.title(plot_options.get("title", "Comparison"))
Hint 4: Don't forget labels and legends
Use label=col
in your histogram so each column has a legend entry.
Solution
def compare_histograms(df, *columns, **plot_options):
    """
    Plot overlaid histograms of multiple numeric columns from the DataFrame.

    NOTE(review): this function uses the name ``plt`` (presumably
    ``matplotlib.pyplot``), which must already be in scope; no import for it
    is visible in this section -- confirm.

    Parameters:
        df : pd.DataFrame
            The dataset to visualize.
        *columns : str
            Column names to plot. Missing values in each column are dropped
            before plotting.
        **plot_options : dict
            Keyword arguments for customizing the plot. Recognized keys and
            their defaults: ``bins`` (30), ``alpha`` (0.5),
            ``figsize`` ((6, 4)), ``title`` ("Comparison of Columns").
            Other keys are ignored.

    Returns:
        None
    """
    # Guard clause: nothing to plot without at least one column name
    if not columns:
        print("Please provide at least one column to plot.")
        return

    # Look the per-histogram options up once instead of on every iteration
    bins = plot_options.get("bins", 30)
    alpha = plot_options.get("alpha", 0.5)

    plt.figure(figsize=plot_options.get("figsize", (6, 4)))

    for col in columns:
        data = df[col].dropna()
        plt.hist(
            data,
            bins=bins,
            alpha=alpha,
            label=col,  # one legend entry per column
            edgecolor='black',
        )

    plt.title(plot_options.get("title", "Comparison of Columns"))
    plt.xlabel("Value")
    plt.ylabel("Frequency")
    plt.legend()
    plt.grid(True)
    plt.show()
## Example usage:
# compare_histograms(stroke_data,'avg_glucose_level','bmi',bins=40,alpha=0.6,title="Glucose vs BMI Distribution",figsize=(8, 5))