- The model predicts everything as “no” (0)
- Because the data are imbalanced and most labels are actually “no,” the model still gets a deceptively high accuracy by doing almost nothing meaningful.
- I used SMOTE to balance the dataset by oversampling the minority class
- The model achieves a 65% accuracy, lower than before applying SMOTE.
- However, this is acceptable because the new model improved recall for the minority class & actually "learned" to predict instead of assigning 0 ("no") to all cases for the sake of high accuracy score.
- All of them are 0 because the model never predicts 1 (no true positives).
- Meanwhile:
For class 0, the metrics are high because everything is predicted as 0, and the vast majority of actual labels are 0.
- The new model correctly predicts 41% of the positive class (recall)
- Only 14% of all of the model's predicted positives are correct (precision).
=> With SMOTE, the model is now able to detect the minority class (higher recall than a model that predicts “0” always)
SELECT * FROM bank_schema.bank_targets;
-- Total number of rows
SELECT COUNT(age) FROM bank_schema.bank_targets;
-- Age distribution statistics
SELECT MIN(age) AS min_age,
MAX(age) AS max_age,
AVG(age) AS mean_age,
MAX(age) - MIN(age) AS range_age
FROM bank_schema.bank_targets;
-- Occupation distribution statistics
SELECT job,
COUNT(job) AS number_of_people_with_this_job
FROM bank_schema.bank_targets
GROUP BY job
ORDER BY COUNT(job) DESC;
-- Retrieve the number of unique occupations
SELECT COUNT(DISTINCT job)
FROM bank_schema.bank_targets;
-- Calculate percentages of each occupation
SELECT (969 / COUNT(*)) * 100,
(946 / COUNT(*)) * 100,
(768 / COUNT(*)) * 100,
(478 / COUNT(*)) * 100,
(417 / COUNT(*)) * 100,
(230 / COUNT(*)) * 100,
(183 / COUNT(*)) * 100,
(168 / COUNT(*)) * 100,
(128 / COUNT(*)) * 100,
(112 / COUNT(*)) * 100,
(84 / COUNT(*)) * 100,
(38 / COUNT(*)) * 100
FROM bank_schema.bank_targets;
-- Marital status statistics
SELECT marital,
COUNT(marital) AS number_of_people
FROM bank_schema.bank_targets
GROUP BY marital;
-- Calculate percentages of customers in each marital status
SELECT (2797 / COUNT(*)) * 100,
(1196 / COUNT(*)) * 100,
(528 / COUNT(*)) * 100
FROM bank_schema.bank_targets;
-- Education statistics
UPDATE bank_schema.bank_targets
SET education = 'secondary'
WHERE education = 'unknown';
SELECT education,
COUNT(education) AS number_of_people
FROM bank_schema.bank_targets
GROUP BY education;
-- Calculate percentages of customers in each education level
SELECT (678 / COUNT(*)) * 100,
(2493 / COUNT(*)) * 100,
(1350 / COUNT(*)) * 100
FROM bank_schema.bank_targets;
-- Default rate statistics
SELECT default_rate,
COUNT(default_rate) AS number_of_people
FROM bank_schema.bank_targets
GROUP BY default_rate;
-- Calculate percentages of customers for each default rate
SELECT (4445 / COUNT(*)) * 100 AS `No_%`,
(76 / COUNT(*)) * 100 AS `Yes_%`
FROM bank_schema.bank_targets;
-- Balance statistics
SELECT MIN(balance) AS min,
MAX(balance) AS max,
AVG(balance) AS mean,
(MAX(balance) - MIN(balance)) AS `range`
FROM bank_schema.bank_targets;
-- Housing statistics
SELECT housing,
COUNT(housing) AS num_people
FROM bank_schema.bank_targets
GROUP BY housing;
-- Calculate percentages of customers for each housing category
SELECT (1962 / COUNT(*)) * 100 AS `No_%`,
(2559 / COUNT(*)) * 100 AS `Yes_%`
FROM bank_schema.bank_targets;
-- Loan statistics
SELECT loan,
COUNT(loan) AS num_loan
FROM bank_schema.bank_targets
GROUP BY loan;
-- Calculate percentages of customers for each loan category
SELECT (3830 / COUNT(*)) * 100 AS `No_%`,
(691 / COUNT(*)) * 100 AS `Yes_%`
FROM bank_schema.bank_targets;
-- Last contact month analysis
SELECT `month`,
COUNT(`month`)
FROM bank_schema.bank_targets
GROUP BY `month`
ORDER BY COUNT(`month`);
-- Calculate percentages of customers for each month
SELECT (20 / COUNT(*)) * 100 AS december,
(49 / COUNT(*)) * 100 AS march,
(52 / COUNT(*)) * 100 AS september,
(80 / COUNT(*)) * 100 AS october,
(148 / COUNT(*)) * 100 AS january,
(222 / COUNT(*)) * 100 AS february,
(293 / COUNT(*)) * 100 AS april,
(389 / COUNT(*)) * 100 AS november,
(531 / COUNT(*)) * 100 AS june,
(633 / COUNT(*)) * 100 AS august,
(706 / COUNT(*)) * 100 AS july,
(1398 / COUNT(*)) * 100 AS may
FROM bank_schema.bank_targets;
-- Duration of phone call analysis
SELECT MIN(duration) AS min_duration,
MAX(duration) AS max_duration,
AVG(duration) AS average_duration,
MAX(duration) - MIN(duration) AS range_duration
FROM bank_schema.bank_targets;
-- Number of contacts performed for each client
SELECT MIN(campaign) AS min_contacts,
MAX(campaign) AS max_contacts,
AVG(campaign) AS average_contacts
FROM bank_schema.bank_targets;
-- Number of days that passed by after the client
was last contacted from a previous campaign
-- ALTER TABLE bank_schema.bank_targets
RENAME COLUMN pdays TO days_passed;
SELECT COUNT(days_passed)
FROM bank_schema.bank_targets
WHERE days_passed = -1;
SELECT AVG(days_passed)
FROM bank_schema.bank_targets
WHERE days_passed != -1;
-- Number of contacts performed
before this campaign for each customer
-- ALTER TABLE bank_schema.bank_targets
RENAME COLUMN previous TO previous_contacts;
SELECT AVG(previous_contacts)
FROM bank_schema.bank_targets;
SELECT previous_contacts,
COUNT(previous_contacts)
FROM bank_schema.bank_targets
GROUP BY previous_contacts
ORDER BY COUNT(previous_contacts) DESC;
-- Outcome of the previous campaign
-- ALTER TABLE bank_schema.bank_targets
RENAME COLUMN poutcome TO previous_campaign_outcome;
SELECT previous_campaign_outcome,
COUNT(previous_campaign_outcome) AS count
FROM bank_schema.bank_targets
GROUP BY previous_campaign_outcome
ORDER BY COUNT(previous_campaign_outcome);
-- Calculate percentages
SELECT (129 / COUNT(*)) * 100 AS success,
(490 / COUNT(*)) * 100 AS failure,
(197 / COUNT(*)) * 100 AS other,
(3705 / COUNT(*)) * 100 AS `unknown`
FROM bank_schema.bank_targets;
-- Customers' responses to term deposit
SELECT y,
COUNT(y) AS number_of_customers
FROM bank_schema.bank_targets
GROUP BY y;
-- Housing ownership
SELECT housing,
COUNT(housing)
FROM bank_schema.bank_targets
WHERE y = 'yes'
GROUP BY housing;
SELECT (301 / 521) * 100 AS `no`,
(220 / 521) * 100 AS `yes`;
SELECT (93 / 521) * 100 AS may,
(61 / 521) * 100 AS july,
(79 / 521) * 100 AS aug,
(55 / 521) * 100 AS june,
(39 / 521) * 100 AS november,
(56 / 521) * 100 AS april,
(38 / 521) * 100 AS february,
(16 / 521) * 100 AS january,
(37 / 521) * 100 AS october,
(17 / 521) * 100 AS september,
(21 / 521) * 100 AS march,
(9 / 521) * 100 AS december;
-- Analyzing customers who responded "Yes" to term deposit
-- Age distribution
SELECT MIN(age),
MAX(age),
AVG(age),
MAX(age) - MIN(age) AS `range`
FROM bank_schema.bank_targets
WHERE y = 'yes';
-- Occupation
SELECT COUNT(DISTINCT job)
FROM bank_schema.bank_targets
WHERE y = 'yes';
SELECT job,
COUNT(job)
FROM bank_schema.bank_targets
WHERE y = 'yes'
GROUP BY job
ORDER BY COUNT(job) DESC;
SELECT (131 / 521) * 100 AS management,
(83 / 521) * 100 AS blue_collar,
(69 / 521) * 100 AS technician,
(58 / 521) * 100 AS administration,
(54 / 521) * 100 AS services,
(38 / 521) * 100 AS retired,
(20 / 521) * 100 AS sel_employed,
(19 / 521) * 100 AS entrepreneur,
(15 / 521) * 100 AS housemaid,
(14 / 521) * 100 AS student,
(10 / 521) * 100 AS unemployed,
(11 / 521) * 100 AS unknown;
| Information regarding the entire dataset | Customers who responded "Yes" to the term deposit | Customers who responded "Yes" to the term deposit.1 | |
|---|---|---|---|
| Total number | 4521 | 521 | 4000 |
| ------------------- | --------------------------------------------------------- | Age distribution------------------------------------------------ | -------------------------------------------------------------------------- |
| Min | 19 | 19 | 19 |
| Max | 87 | 87 | 86 |
| Range | 41 | 42 | 41 |
| Average | 68 | 68 | 67 |
| ------------------- | --------------------------------------------------------- | Occupation distribution-------------------------------------- | -------------------------------------------------------------------------- |
| Number of unique jobs | 12 | 12 | 12 |
| Management | 969 (21.43%) | 131 (25%) | 838 (21%) |
| Blue-collar | 946 (21%) | 83 (16%) | 877 (22%) |
| Technician | 768 (17%) | 69 (13%) | 685 (17%) |
| Admin | 478 (10.57%) | 58 (11%) | 420 (11%) |
| Services | 417 (9.22%) | 54 (10%) | 379 (9%) |
| Retired | 230 (5.09%) | 38 (7%) | 176 (4%) |
| Self-employed | 183 (4.05%) | 20 (4%) | 163 (4%) |
| Entrepreneur | 168 (3.72%) | 19 (4%) | 153 (4%) |
| Unemployed | 128 (2.83%) | 15 (3%) | 115 (3%) |
| Housemaid | 112 (2.48%) | 14 (3%) | 98 (2%) |
| Students | 84 (1.86%) | 13 (2%) | 65 (2%) |
| Unknown | 38 (0.84%) | 7 (1%) | 31 (1%) |
| ------------------- | --------------------------------------------------------- | Marital Status--------------------------------------------------- | -------------------------------------------------------------------------- |
| Married | 2797 (62%) | 277 (53%) | 2520 (63%) |
| Single | 1196 (26%) | 167 (32%) | 1029 (26%) |
| Divorced | 528 (12%) | 77 (15%) | 451 (11%) |
| ------------------- | --------------------------------------------------------- | Education--------------------------------------------------------- | -------------------------------------------------------------------------- |
| Primary | 678 (15%) | 64 (12%) | 614 (15%) |
| Secondary | 2493 (55%) | 264 (51%) | 2229 (56%) |
| Tertiary | 1350 (30%) | 193 (37%) | 1157 (29%) |
| ------------------- | --------------------------------------------------------- | Default rate-------------------------------------------------------- | -------------------------------------------------------------------------- |
| No | 4445 (98%) | 512 (98%) | 3933 (98%) |
| Yes | 76 (2%) | 9 (2%) | 67 (2%) |
| ------------------- | --------------------------------------------------------- | Balance------------------------------------------------------------- | -------------------------------------------------------------------------- |
| Min | -3313 | -1206 | -3313 |
| Max | 71188 | 26965 | 71188 |
| Mean | 1423 | 158 | 1403 |
| Range | 74501 | 28171 | 74501 |
| ------------------- | --------------------------------------------------------- | Housing Ownership----------------------------------------- | -------------------------------------------------------------------------- |
| No | 1962 (43%) | 301 (58%) | 1661 (42%) |
| Yes | 2559 (57%) | 220 (42%) | 2339 (58%) |
| ------------------- | --------------------------------------------------------- | Loan------------------------------------------------------------- | -------------------------------------------------------------------------- |
| No | 3830 (85%) | 478 (92%) | 3352 (84%) |
| Yes | 691 (15%) | 43 (8%) | 648 (16%) |
| ------------------- | --------------------------------------------------------- | Month of previous contact---------------------------------- | -------------------------------------------------------------------------- |
| May | 1398 (31%) | 93 (18%) | 1305 (3%) |
| July | 706 (16%) | 61 (12%) | 645 (16%) |
| August | 633 (14%) | 79 (15%) | 554 (14%) |
| June | 531 (12%) | 55 (11%) | 476 (12%) |
| November | 389 (9%) | 39(7%) | 350 (9%) |
| April | 293 (6%) | 56 (11%) | 237 (6%) |
| February | 222 (5%) | 38 (7%) | 184 (5%) |
| January | 148 (3%) | 16 (3%) | 132 (3%) |
| October | 80 (2%) | 37 (7%) | 43 (1%) |
| September | 52 (1%) | 17 (3%) | 35 (1%) |
| March | 49 (1%) | 21 (4%) | 28 (1%) |
| December | 20 (0.4%) | 9 (1%) | 11 (0.3%) |
| ------------------- | --------------------------------------------------------- | Duration of phone call---------------------------------------- | -------------------------------------------------------------------------- |
| Min | 4 seconds | 30 seconds | 4 seconds |
| Max | 3025 seconds | 2769 seconds | 3025 seconds |
| Average | 264 seconds | 553 seconds | 226 seconds |
| Range | 301 seconds | 2739 seconds | 3021 seconds |
| ------------------- | --------------------------------------------------------- | Number of contacts------------------------------------------- | -------------------------------------------------------------------------- |
| Min | 1 time | 1 | 1 |
| Max | 50 times | 24 | 50 |
| Average | 3 times | 2 | 3 |
| ------------------- | --------------------------------------------------------- | Number of days passed by after last contact------------ | -------------------------------------------------------------------------- |
| Average | 225 days | 196 days | 233 days |
| Not previously contacted | 3705 customers | 337 customers | 3368 customers |
| ------------------- | --------------------------------------------------------- | Number of contacts performed before this campaign | -------------------------------------------------------------------------- |
| 0 times | 3705 customers | 337 customers | 3368 customers |
| Average | 1 time | 3 times | 3 times |
| ------------------- | --------------------------------------------------------- | Outcome of the previous campaign------------------------ | -------------------------------------------------------------------------- |
| Success | 129 (3%) | 83 (16%) | 46 (1%) |
| Failure | 490 (11%) | 63 (12%) | 427 (11%) |
| Other | 197 (5%) | 38 (7%) | 159 (4%) |
| Unknown | 3705 (82%) | 337 (65%) | 3368 (84%) |