# ============================================================
# 0) Library Installation
# ============================================================
!pip -q install --user numpy pandas scikit-learn matplotlib seaborn xgboost imbalanced-learn

# ============================================================
# 1) Imports and Configuration
# ============================================================
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    confusion_matrix, classification_report
)
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.utils.class_weight import compute_class_weight

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

try:
    from xgboost import XGBClassifier
    XGB_OK = True
except Exception:
    XGB_OK = False

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
sns.set_theme(style="whitegrid", context="notebook")

# ============================================================
# 2) Data Loading
# ============================================================
URL = "https://raw.githubusercontent.com/EvagAIML/Case-Study-V3/refs/heads/main/EasyVisa%20(13).csv"
df = pd.read_csv(URL)

print("Shape:", df.shape)
display(df.head())
print("\nFirst 5 rows:")
display(df.head())
print("\nLast 5 rows:")
display(df.tail())

Shape: (25480, 12)

First 5 rows:

Last 5 rows:

# ============================================================
# 3) Data Overview
# ============================================================
print("Data Types:\n", df.dtypes)
print("\nMissing Values per Column:")
display(df.isna().sum().sort_values(ascending=False))

print("\nTarget Distribution:")
print(df["case_status"].value_counts())

print("\nNumeric Summary:")
display(df.select_dtypes(include="number").describe().T)

print("\nSample Categorical Values:")
for col in df.select_dtypes(include="object").columns:
    print(f"\n{col}:\n", df[col].value_counts().head(5))

Data Types:
 case_id                   object
continent                 object
education_of_employee     object
has_job_experience        object
requires_job_training     object
no_of_employees            int64
yr_of_estab                int64
region_of_employment      object
prevailing_wage          float64
unit_of_wage              object
full_time_position        object
case_status               object
dtype: object

Missing Values per Column:

Target Distribution:
case_status
Certified    17018
Denied        8462
Name: count, dtype: int64

Numeric Summary:

Sample Categorical Values:

case_id:
 case_id
EZYV25480    1
EZYV01       1
EZYV02       1
EZYV03       1
EZYV04       1
Name: count, dtype: int64

continent:
 continent
Asia             16861
Europe            3732
North America     3292
South America      852
Africa             551
Name: count, dtype: int64

education_of_employee:
 education_of_employee
Bachelor's     10234
Master's        9634
High School     3420
Doctorate       2192
Name: count, dtype: int64

has_job_experience:
 has_job_experience
Y    14802
N    10678
Name: count, dtype: int64

requires_job_training:
 requires_job_training
N    22525
Y     2955
Name: count, dtype: int64

region_of_employment:
 region_of_employment
Northeast    7195
South        7017
West         6586
Midwest      4307
Island        375
Name: count, dtype: int64

unit_of_wage:
 unit_of_wage
Year     22962
Hour      2157
Week       272
Month       89
Name: count, dtype: int64

full_time_position:
 full_time_position
Y    22773
N     2707
Name: count, dtype: int64

case_status:
 case_status
Certified    17018
Denied        8462
Name: count, dtype: int64

# ============================================================
# 4) Data Cleaning and Feature Preparation
# ============================================================
df_clean = df.copy()

def normalize_yn(x):
    s = str(x).strip().upper()
    if s in {"Y","YES","TRUE","T","1"}: return "YES"
    if s in {"N","NO","FALSE","F","0"}: return "NO"
    return s

for col in ["has_job_experience", "requires_job_training", "full_time_position"]:
    df_clean[col] = df_clean[col].map(normalize_yn)

for col in df_clean.select_dtypes(include="object").columns:
    df_clean[col] = df_clean[col].astype(str).str.strip().str.upper()

if "no_of_employees" in df_clean.columns:
    emp = pd.to_numeric(df_clean["no_of_employees"], errors="coerce")
    neg = emp < 0
    if neg.sum() > 0:
        df_clean.loc[neg, "no_of_employees"] = emp.median()
        print(f"Negative values corrected in 'no_of_employees': {neg.sum()}")

if {"prevailing_wage", "unit_of_wage"}.issubset(df_clean.columns):
    factors = {"HOUR":2080, "WEEK":52, "MONTH":12, "YEAR":1}
    df_clean["unit_of_wage"] = df_clean["unit_of_wage"].astype(str).str.upper().str.strip()
    df_clean["prevailing_wage_yearly"] = df_clean.apply(
        lambda r: pd.to_numeric(r["prevailing_wage"], errors="coerce") * factors.get(r["unit_of_wage"], 1), axis=1
    )
    print("Annualized prevailing wages created.")

if "yr_of_estab" in df_clean.columns:
    years = pd.to_numeric(df_clean["yr_of_estab"], errors="coerce")
    valid = years.between(1800, 2025)
    removed = (~valid).sum()
    df_clean = df_clean.loc[valid]
    print(f"Invalid establishment years removed: {removed}")

df_clean["case_status"] = df_clean["case_status"].map({"DENIED":0, "CERTIFIED":1})
df_clean.dropna(subset=["case_status"], inplace=True)
df_clean["case_status"] = df_clean["case_status"].astype(int)

if "case_id" in df_clean.columns:
    df_clean.drop(columns=["case_id"], inplace=True)

print("Final shape after cleaning:", df_clean.shape)
display(df_clean.head())

Negative values corrected in 'no_of_employees': 33
Annualized prevailing wages created.
Invalid establishment years removed: 0
Final shape after cleaning: (25480, 12)

# ============================================================
# 5) Exploratory Analysis (Univariate)
# ============================================================
num_cols = df_clean.select_dtypes(include="number").columns.tolist()
cat_cols = df_clean.select_dtypes(exclude="number").columns.tolist()

for c in [x for x in num_cols if x != "case_status"]:
    fig, axs = plt.subplots(2, 1, figsize=(10, 6), gridspec_kw={'height_ratios':[1,2]}, sharex=True)
    sns.boxplot(x=df_clean[c], ax=axs[0])
    sns.histplot(df_clean[c], kde=True, ax=axs[1])
    axs[0].set_title(f"Boxplot — {c}")
    axs[1].set_title(f"Histogram — {c}")
    plt.tight_layout(); plt.show()

for c in [x for x in cat_cols if x.upper() != "CASE_STATUS"]:
    plt.figure(figsize=(10,4))
    order = df_clean[c].value_counts().index
    sns.countplot(data=df_clean, x=c, order=order)
    plt.title(f"Counts — {c}")
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout(); plt.show()

# ============================================================
# 6) Exploratory Analysis (Bivariate)
# ============================================================
target = "case_status"

for c in [x for x in num_cols if x != target]:
    fig, axs = plt.subplots(1, 2, figsize=(12, 4))
    sns.boxplot(data=df_clean, x=target, y=c, ax=axs[0])
    sns.kdeplot(data=df_clean, x=c, hue=target, common_norm=False, ax=axs[1])
    axs[0].set_title(f"{c} by Case Outcome")
    axs[1].set_title(f"Density of {c} by Case Outcome")
    plt.tight_layout(); plt.show()

def stacked_bar(data, feature):
    ctab = pd.crosstab(data[feature], data[target], normalize="index")*100
    ctab.plot(kind="bar", stacked=True, figsize=(10,4))
    plt.title(f"{feature}: % Certified vs Denied")
    plt.ylabel("Percentage")
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    plt.show()

for c in [x for x in cat_cols if x.upper() != "CASE_STATUS"]:
    stacked_bar(df_clean, c)

plt.figure(figsize=(8,6))
sns.heatmap(df_clean.select_dtypes(include="number").corr(), cmap="coolwarm", annot=False)
plt.title("Correlation Map (Numeric Features)")
plt.tight_layout(); plt.show()

# ============================================================
# 7) Analytical Insights
# ============================================================
def cert_rate_by(df, col):
    table = df.groupby(col)[target].mean().rename("cert_rate").to_frame()
    table["count"] = df.groupby(col)[target].size()
    return table.sort_values("cert_rate", ascending=False)

df_clean["wage_bin"] = pd.qcut(df_clean["prevailing_wage_yearly"], q=4, duplicates="drop")
high_cut = df_clean["prevailing_wage_yearly"].quantile(0.75)
df_highpay = df_clean[df_clean["prevailing_wage_yearly"] >= high_cut]

edu_highpay = cert_rate_by(df_highpay, "education_of_employee")
continent_cert = cert_rate_by(df_clean, "continent")
exp_cert = cert_rate_by(df_clean, "has_job_experience")
region_wage = df_clean.groupby("region_of_employment")["prevailing_wage_yearly"].agg(["count","median","mean"]).sort_values("median", ascending=False)
wage_cert = cert_rate_by(df_clean, "wage_bin")
unit_cert = cert_rate_by(df_clean, "unit_of_wage")

print("Education among high-paid roles:")
display(edu_highpay)
print("\nCertification rate by continent:")
display(continent_cert)
print("\nCertification rate by prior experience:")
display(exp_cert)
print("\nPrevailing wage by region:")
display(region_wage)
print("\nCertification by wage quartile:")
display(wage_cert)
print("\nCertification by wage unit:")
display(unit_cert)

Education among high-paid roles:

Certification rate by continent:

Certification rate by prior experience:

Prevailing wage by region:

Certification by wage quartile:

Certification by wage unit:

# ============================================================
# 8) Data Preparation for Modeling
# ============================================================
X = df_clean.drop(columns=[target])
y = df_clean[target].astype(int)

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, stratify=y, random_state=RANDOM_STATE)
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=RANDOM_STATE)

num_cols = X_train.select_dtypes(include="number").columns.tolist()
cat_cols = X_train.select_dtypes(exclude="number").columns.tolist()

numeric_tf = Pipeline([("impute", SimpleImputer(strategy="median")), ("scale", StandardScaler())])
categorical_tf = Pipeline([("impute", SimpleImputer(strategy="most_frequent")), ("encode", OneHotEncoder(handle_unknown="ignore"))])

preprocessor = ColumnTransformer([("num", numeric_tf, num_cols), ("cat", categorical_tf, cat_cols)], remainder="drop")
print("Data preprocessed successfully.")

Data preprocessed successfully.

# ============================================================
# 9) Metric and Visualization Utilities
# ============================================================
def evaluate(model, X, y):
    y_pred = model.predict(X)
    y_prob = model.predict_proba(X)[:,1] if hasattr(model.named_steps["clf"], "predict_proba") else None
    acc = accuracy_score(y, y_pred)
    prec = precision_score(y, y_pred)
    rec = recall_score(y, y_pred)
    f1 = f1_score(y, y_pred)
    auc = roc_auc_score(y, y_prob) if y_prob is not None else np.nan
    cm = confusion_matrix(y, y_pred)
    tn, fp, fn, tp = cm.ravel()
    spec = tn/(tn+fp)
    return acc, prec, rec, f1, auc, spec, cm

def plot_cm(cm, title="Confusion Matrix"):
    plt.figure(figsize=(5,4))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Greens",
                xticklabels=["Pred Denied","Pred Certified"],
                yticklabels=["Actual Denied","Actual Certified"])
    plt.title(title)
    plt.tight_layout(); plt.show()

# ============================================================
# 10) Baseline Models on ORIGINAL TRAIN (No Resampling)
#     - Class weights emphasize DENIED (0)
#     - Validate on untouched VALID split
# ============================================================
def make_plain(model):
    return Pipeline([("pre", preprocessor), ("clf", model)])

# Compute weights from the train split (robust to class ratio)
classes = np.array([0,1])
weights = compute_class_weight(class_weight="balanced", classes=classes, y=y_train)
class_weights = {0: float(weights[0]), 1: float(weights[1])}
print("Class weights (train-derived):", class_weights)

orig_models = {
    "Logistic (weighted)":   make_plain(LogisticRegression(
        solver="liblinear", class_weight=class_weights, max_iter=3000, random_state=RANDOM_STATE)),
    "Decision Tree (weighted)": make_plain(DecisionTreeClassifier(
        class_weight=class_weights, random_state=RANDOM_STATE)),
    "Random Forest (weighted)": make_plain(RandomForestClassifier(
        n_estimators=400, class_weight=class_weights, n_jobs=-1, random_state=RANDOM_STATE)),
    "Bagging (DT base)":     make_plain(BaggingClassifier(
        n_estimators=300, random_state=RANDOM_STATE)),
    "AdaBoost":              make_plain(AdaBoostClassifier(
        n_estimators=400, random_state=RANDOM_STATE)),
    "Gradient Boosting":     make_plain(GradientBoostingClassifier(
        random_state=RANDOM_STATE)),
}

# Add XGBoost if available
if XGB_OK:
    orig_models["XGBoost"] = make_plain(XGBClassifier(
        random_state=RANDOM_STATE,
        n_estimators=500, learning_rate=0.1, max_depth=4,
        subsample=0.9, colsample_bytree=0.9, eval_metric="logloss",
        tree_method="hist", n_jobs=-1))

rows, fitted_orig = [], {}
for name, pipe in orig_models.items():
    pipe.fit(X_train, y_train)
    acc, prec, rec, f1, auc, spec, cm = evaluate(pipe, X_valid, y_valid)
    rows.append([name, acc, prec, rec, f1, auc, spec])
    fitted_orig[name] = pipe

orig_perf = (pd.DataFrame(rows, columns=["Model","Accuracy","Precision(1)","Recall(1)",
                                        "F1(1)","ROC_AUC","Spec(0)"])
             .sort_values(by=["F1(1)","ROC_AUC"], ascending=False).reset_index(drop=True))

print("=== ORIGINAL TRAIN — Validation Results ===")
display(orig_perf.round(4))

Class weights (train-derived): {0: 1.5056135513098285, 1: 0.7486044461854863}
=== ORIGINAL TRAIN — Validation Results ===

# ============================================================
# 11) Model Building — OVERSAMPLED TRAIN (SMOTE)
#     - Resample only inside training fold via Pipeline
#     - Evaluate on untouched VALID set
# ============================================================

from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

def make_smote(model):
    return ImbPipeline([
        ("pre", preprocessor),
        ("smote", SMOTE(random_state=RANDOM_STATE)),
        ("clf", model)
    ])

smote_models = {
    "Logistic + SMOTE (weighted)": make_smote(LogisticRegression(
        solver="liblinear", class_weight=class_weights, max_iter=3000, random_state=RANDOM_STATE)),
    "Decision Tree + SMOTE (weighted)": make_smote(DecisionTreeClassifier(
        class_weight=class_weights, random_state=RANDOM_STATE)),
    "Random Forest + SMOTE (weighted)": make_smote(RandomForestClassifier(
        n_estimators=400, class_weight=class_weights, n_jobs=-1, random_state=RANDOM_STATE)),
    "Gradient Boosting + SMOTE": make_smote(GradientBoostingClassifier(random_state=RANDOM_STATE)),
    "AdaBoost + SMOTE": make_smote(AdaBoostClassifier(n_estimators=400, random_state=RANDOM_STATE))
}

rows, fitted_smote = [], {}
for name, pipe in smote_models.items():
    pipe.fit(X_train, y_train)
    acc, prec, rec, f1, auc, spec, cm = evaluate(pipe, X_valid, y_valid)
    rows.append([name, acc, prec, rec, f1, auc, spec])
    fitted_smote[name] = pipe

smote_perf = pd.DataFrame(rows, columns=["Model","Accuracy","Precision(1)","Recall(1)",
                                         "F1(1)","ROC_AUC","Spec(0)"])\
                            .sort_values(by=["F1(1)","Spec(0)"], ascending=False)\
                            .reset_index(drop=True)

print("=== SMOTE TRAIN — Validation Results ===")
display(smote_perf.round(4))

=== SMOTE TRAIN — Validation Results ===

# ============================================================
# 12) Model Building — UNDERSAMPLED TRAIN
#     - Undersample only inside training fold via Pipeline
#     - Evaluate on untouched VALID set
# ============================================================

from imblearn.under_sampling import RandomUnderSampler

def make_under(model):
    return ImbPipeline([
        ("pre", preprocessor),
        ("under", RandomUnderSampler(random_state=RANDOM_STATE)),
        ("clf", model)
    ])

under_models = {
    "Logistic + Under (weighted)": make_under(LogisticRegression(
        solver="liblinear", class_weight=class_weights, max_iter=3000, random_state=RANDOM_STATE)),
    "Decision Tree + Under (weighted)": make_under(DecisionTreeClassifier(
        class_weight=class_weights, random_state=RANDOM_STATE)),
    "Random Forest + Under (weighted)": make_under(RandomForestClassifier(
        n_estimators=400, class_weight=class_weights, n_jobs=-1, random_state=RANDOM_STATE)),
    "Gradient Boosting + Under": make_under(GradientBoostingClassifier(random_state=RANDOM_STATE)),
    "AdaBoost + Under": make_under(AdaBoostClassifier(n_estimators=400, random_state=RANDOM_STATE))
}

rows, fitted_under = [], {}
for name, pipe in under_models.items():
    pipe.fit(X_train, y_train)
    acc, prec, rec, f1, auc, spec, cm = evaluate(pipe, X_valid, y_valid)
    rows.append([name, acc, prec, rec, f1, auc, spec])
    fitted_under[name] = pipe

under_perf = pd.DataFrame(rows, columns=["Model","Accuracy","Precision(1)","Recall(1)",
                                         "F1(1)","ROC_AUC","Spec(0)"])\
                            .sort_values(by=["F1(1)","Spec(0)"], ascending=False)\
                            .reset_index(drop=True)

print("=== UNDERSAMPLED TRAIN — Validation Results ===")
display(under_perf.round(4))

=== UNDERSAMPLED TRAIN — Validation Results ===

# ============================================================
# 13) Select Top Candidates Across All Methods
# ============================================================

def pick_top(df, k=3):
    return df.sort_values(by=["F1(1)","Spec(0)"], ascending=False).head(k)["Model"].tolist()

candidates = list(dict.fromkeys(
    pick_top(orig_perf, 3) + pick_top(smote_perf, 3) + pick_top(under_perf, 3)
))

print("=== Top Candidates for Tuning ===")
for c in candidates:
    print(" -", c)

=== Top Candidates for Tuning ===
 - Gradient Boosting
 - XGBoost
 - AdaBoost
 - Random Forest + SMOTE (weighted)
 - Gradient Boosting + SMOTE
 - AdaBoost + SMOTE
 - AdaBoost + Under
 - Gradient Boosting + Under
 - Random Forest + Under (weighted)

# ============================================================
# 14) Hyperparameter Tuning (CV + Randomized Search)
#     - Focus: F1 optimization while tracking DENIED specificity
# ============================================================

from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from scipy.stats import loguniform, randint

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

def build_model_by_name(name):
    if "Gradient Boosting" in name:
        base = GradientBoostingClassifier(random_state=RANDOM_STATE)
        pipe = make_plain(base)
        params = {
            "clf__n_estimators": [200, 300, 500],
            "clf__learning_rate": [0.05, 0.1, 0.2],
            "clf__max_depth": [2, 3, 4],
            "clf__subsample": [0.8, 1.0]
        }
        return pipe, params

    if "Random Forest" in name:
        base = RandomForestClassifier(
            n_estimators=400, class_weight=class_weights, n_jobs=-1, random_state=RANDOM_STATE)
        pipe = make_plain(base)
        params = {
            "clf__max_depth": [None, 10, 20],
            "clf__min_samples_split": [2, 5, 10],
            "clf__min_samples_leaf": [1, 2, 4],
            "clf__max_features": ["sqrt", "log2"]
        }
        return pipe, params

    if "Logistic" in name:
        base = LogisticRegression(
            solver="liblinear", class_weight=class_weights, max_iter=4000, random_state=RANDOM_STATE)
        pipe = make_plain(base)
        params = {"clf__C": loguniform(1e-3, 1e2), "clf__penalty": ["l1", "l2"]}
        return pipe, params

    return None

tuned_models, results = {}, []
for name in candidates[:3]:
    built = build_model_by_name(name)
    if built is None:
        continue
    pipe, grid = built
    tuner = RandomizedSearchCV(
        estimator=pipe,
        param_distributions=grid,
        n_iter=20,
        scoring="f1",
        cv=cv,
        n_jobs=-1,
        random_state=RANDOM_STATE,
        refit=True,
        verbose=0
    )
    tuner.fit(X_train, y_train)
    tuned_models[name] = tuner.best_estimator_

    acc, prec, rec, f1, auc, spec, cm = evaluate(tuner.best_estimator_, X_valid, y_valid)
    results.append([name, tuner.best_params_, acc, prec, rec, f1, auc, spec])

tuned_df = pd.DataFrame(results, columns=[
    "Model","Best Params","Accuracy","Precision(1)","Recall(1)","F1(1)","ROC_AUC","Spec(0)"
]).sort_values(by=["F1(1)","Spec(0)"], ascending=False).reset_index(drop=True)

print("=== Tuned Models — Validation ===")
display(tuned_df)

best_name = tuned_df.iloc[0]["Model"]
best_model = tuned_models[best_name]
print("Selected tuned model:", best_name)

=== Tuned Models — Validation ===

Selected tuned model: Gradient Boosting

# ============================================================
# 15) Threshold Optimization
#     - Balance Certified Recall vs Denied Specificity
# ============================================================

def proba_positive(pipe, X):
    clf = pipe.named_steps["clf"]
    if hasattr(clf, "predict_proba"):
        return pipe.predict_proba(X)[:, 1]
    raise RuntimeError("This model does not support predict_proba().")

val_prob = proba_positive(best_model, X_valid)

thresholds = np.linspace(0.2, 0.8, 121)
records = []
for t in thresholds:
    y_pred = (val_prob >= t).astype(int)
    acc = accuracy_score(y_valid, y_pred)
    prec = precision_score(y_valid, y_pred, zero_division=0)
    rec = recall_score(y_valid, y_pred, zero_division=0)
    f1 = f1_score(y_valid, y_pred, zero_division=0)
    tn, fp, fn, tp = confusion_matrix(y_valid, y_pred, labels=[0,1]).ravel()
    spec = tn / (tn + fp) if (tn + fp) else 0
    score = 0.6*f1 + 0.4*spec
    records.append([t, acc, prec, rec, f1, spec, score])

thresh_df = pd.DataFrame(records, columns=["Threshold","Accuracy","Precision(1)","Recall(1)",
                                           "F1(1)","Spec(0)","Score"])\
                        .sort_values(by="Score", ascending=False).reset_index(drop=True)

best_thr = float(thresh_df.iloc[0]["Threshold"])
print("Best operating threshold:", round(best_thr, 3))
display(thresh_df.head(10))

Best operating threshold: 0.745

# ============================================================
# 16) Validation Threshold Sweep — Multi-Threshold Confusion Metrics
#     - Computes metrics at multiple thresholds (incl. default 0.50)
#     - Finds: best-F1 threshold, "even" (recall vs specificity) threshold
#     - Shows confusion stats for a curated set of thresholds
# ============================================================

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    confusion_matrix
)

# In case not already defined earlier
def proba_positive(est, X):
    clf = est.named_steps["clf"]
    if hasattr(clf, "predict_proba"):
        return est.predict_proba(X)[:, 1]
    raise RuntimeError("Estimator lacks predict_proba for threshold tuning.")

def eval_threshold(y_true, y_prob, thr):
    y_pred = (y_prob >= thr).astype(int)
    acc  = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, pos_label=1, zero_division=0)
    rec  = recall_score(y_true, y_pred, pos_label=1, zero_division=0)   # Certified recall
    f1c  = f1_score(y_true, y_pred, pos_label=1, zero_division=0)
    cm   = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    spec = tn/(tn+fp) if (tn+fp) else 0.0                               # Denied specificity
    cert_ok = tp/(tp+fn) if (tp+fn) else 0.0
    den_ok  = tn/(tn+fp) if (tn+fp) else 0.0
    return {
        "threshold": thr, "accuracy": acc, "precision(1)": prec, "recall(1)": rec,
        "f1(1)": f1c, "spec(0)": spec, "cert_ok%": cert_ok, "denied_ok%": den_ok,
        "tn": tn, "fp": fp, "fn": fn, "tp": tp, "cm": cm
    }

def summarize_thresholds(y_true, y_prob, thr_list, title_prefix="Validation", show_heatmaps=False):
    rows = []
    for t in thr_list:
        rows.append(eval_threshold(y_true, y_prob, t))
    df = pd.DataFrame(rows).drop(columns=["cm"])
    df_sorted = df.sort_values(by=["f1(1)", "spec(0)"], ascending=False).reset_index(drop=True)
    display(df_sorted.round(4))

    if show_heatmaps:
        for r in rows:
            cm = r["cm"]
            cm_df = pd.DataFrame(cm,
                                 index=["Actual DENIED (0)", "Actual CERTIFIED (1)"],
                                 columns=["Pred DENIED (0)", "Pred CERTIFIED (1)"])
            plt.figure(figsize=(5.2,4.2))
            sns.heatmap(cm_df, annot=True, fmt="d", cmap="Purples", cbar=True, linewidths=.5)
            plt.title(f"{title_prefix} — Confusion Matrix @ Thr {r['threshold']:.3f}")
            plt.tight_layout(); plt.show()
    return df_sorted

# ---- Run on VALIDATION ----
val_prob = proba_positive(best_model, X_valid)

# Fine sweep to find best-F1 and best "even" (harmonic mean of recall & specificity)
grid = np.linspace(0.20, 0.80, 121)
scan = [eval_threshold(y_valid, val_prob, t) for t in grid]
scan_df = pd.DataFrame(scan).drop(columns=["cm"])
scan_df["even_hmean"] = (2*scan_df["recall(1)"]*scan_df["spec(0)"]) / (
    (scan_df["recall(1)"] + scan_df["spec(0)"]).replace(0, np.nan)
)

VAL_best_f1_threshold   = float(scan_df.loc[scan_df["f1(1)"].idxmax(), "threshold"])
VAL_even_threshold      = float(scan_df.loc[scan_df["even_hmean"].idxmax(), "threshold"])
VAL_default_threshold   = 0.50

print(f"Validation best-F1 threshold   : {VAL_best_f1_threshold:.3f}")
print(f"Validation even-balance thr    : {VAL_even_threshold:.3f}")
print(f"Validation default threshold   : {VAL_default_threshold:.3f}")

# Curated set of thresholds to inspect on VALIDATION
val_thresholds_to_view = sorted({
    VAL_default_threshold,
    round(VAL_best_f1_threshold, 3),
    round(VAL_even_threshold, 3),
    0.60, 0.65, 0.68, 0.70
})

print("\n=== Validation — Metrics at Selected Thresholds ===")
val_table = summarize_thresholds(y_valid, val_prob, val_thresholds_to_view,
                                 title_prefix="Validation", show_heatmaps=False)

Validation best-F1 threshold   : 0.430
Validation even-balance thr    : 0.680
Validation default threshold   : 0.500

=== Validation — Metrics at Selected Thresholds ===

# ============================================================
# 17) Threshold Sweep with Key Operating Points
#     - Best F1 (Certified)
#     - Balanced predicted counts (≈ equal Certified vs Denied predictions)
#     - Parity of correctness (Certified recall ≈ Denied specificity)
#     - Always include default 0.50
#     Prints counts, % correct/incorrect, and confusion matrices
# ============================================================

import numpy as np
import pandas as pd
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
)

# --- Choose dataset to inspect ---
DATASET = "test"   # "valid" or "test"
if DATASET == "valid":
    X_eval, y_eval = X_valid, y_valid
else:
    X_eval, y_eval = X_test, y_test

# --- Probabilities from the selected model ---
if not hasattr(best_model.named_steps["clf"], "predict_proba"):
    raise RuntimeError("The selected model does not support predict_proba; threshold tuning requires probabilities.")

y_prob = best_model.predict_proba(X_eval)[:, 1]  # P(CERTIFIED)
y_true = y_eval.values if hasattr(y_eval, "values") else y_eval

# --- Helpers ---
def eval_at_threshold(y_true, y_prob, thr):
    y_pred = (y_prob >= thr).astype(int)
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    total = tn + fp + fn + tp

    # Core metrics
    acc  = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, pos_label=1, zero_division=0)
    rec1 = recall_score(y_true, y_pred, pos_label=1, zero_division=0)        # Certified recall (TPR for class 1)
    f1c  = f1_score(y_true, y_pred, pos_label=1, zero_division=0)
    spec0 = tn / (tn + fp) if (tn + fp) else 0.0                              # Denied specificity (TNR for class 0)

    # Predicted counts & shares
    pred_1 = (y_pred == 1).sum()
    pred_0 = (y_pred == 0).sum()
    share_1 = pred_1 / total
    share_0 = pred_0 / total

    # Class-wise correctness
    tot_denied = tn + fp
    tot_cert   = tp + fn
    denied_correct_pct   = (tn / tot_denied) * 100 if tot_denied else 0.0
    denied_incorrect_pct = (fp / tot_denied) * 100 if tot_denied else 0.0
    cert_correct_pct     = (tp / tot_cert) * 100 if tot_cert else 0.0
    cert_incorrect_pct   = (fn / tot_cert) * 100 if tot_cert else 0.0

    return {
        "thr": thr,
        "acc": acc,
        "prec1": prec,
        "rec1": rec1,
        "spec0": spec0,
        "f1c": f1c,
        "pred_0": int(pred_0), "pred_1": int(pred_1),
        "share_0": share_0, "share_1": share_1,
        "tn": int(tn), "fp": int(fp), "fn": int(fn), "tp": int(tp),
        "denied_correct_pct": denied_correct_pct,
        "denied_incorrect_pct": denied_incorrect_pct,
        "cert_correct_pct": cert_correct_pct,
        "cert_incorrect_pct": cert_incorrect_pct,
        "cm": cm
    }

# --- Sweep thresholds ---
grid = np.linspace(0.20, 0.80, 121)
rows = [eval_at_threshold(y_true, y_prob, t) for t in grid]
df = pd.DataFrame(rows)

# --- Pick operating points ---
# 1) Default 0.50
pt_default = df.iloc[(df['thr']-0.50).abs().argmin()]

# 2) Best F1 (Certified) — strongest single-score balance for the positive class
pt_best_f1 = df.iloc[df['f1c'].values.argmax()]

# 3) Balanced predicted counts — roughly equal number of predicted 0 and 1
pt_balanced_counts = df.iloc[(df['pred_0'] - df['pred_1']).abs().argmin()]

# 4) Parity of correctness — Certified recall ≈ Denied specificity
pt_parity = df.iloc[(df['rec1'] - df['spec0']).abs().argmin()]

# --- Collate for display ---
def short_row(pt, label):
    return {
        "Point": label,
        "Threshold": round(float(pt['thr']), 3),
        "Accuracy": round(float(pt['acc']), 4),
        "Precision(1)": round(float(pt['prec1']), 4),
        "Recall(1) (Certified)": round(float(pt['rec1']), 4),
        "Specificity(0) (Denied)": round(float(pt['spec0']), 4),
        "F1(1)": round(float(pt['f1c']), 4),
        "Pred Certified (count)": int(pt['pred_1']),
        "Pred Denied (count)": int(pt['pred_0']),
        "Certified correct %": f"{pt['cert_correct_pct']:.2f}%",
        "Denied correct %": f"{pt['denied_correct_pct']:.2f}%"
    }

summary = pd.DataFrame([
    short_row(pt_default, "Default (0.50)"),
    short_row(pt_best_f1, "Best F1 (Certified)"),
    short_row(pt_balanced_counts, "Balanced Predicted Counts"),
    short_row(pt_parity, "Parity of Correctness (Recall≈Specificity)")
])

print(f"=== Threshold Summary on {DATASET.upper()} ===")
display(summary)

# --- Pretty print confusion matrices & correctness breakdown ---
def print_block(pt, label):
    cm = pt['cm']
    tn, fp, fn, tp = pt['tn'], pt['fp'], pt['fn'], pt['tp']
    print(f"\n--- {label} @ threshold={pt['thr']:.3f} ---")
    print(pd.DataFrame(cm,
                       index=["Actual DENIED (0)", "Actual CERTIFIED (1)"],
                       columns=["Pred DENIED (0)", "Pred CERTIFIED (1)"]))
    print(f"\nCounts:")
    print(f"  Predicted CERTIFIED (1): {pt['pred_1']}")
    print(f"  Predicted DENIED (0)   : {pt['pred_0']}")

    print(f"\nCorrectness (% of each actual class):")
    print(f"  Certified correctly predicted: {pt['cert_correct_pct']:.2f}%   (Incorrect: {pt['cert_incorrect_pct']:.2f}%)")
    print(f"  Denied correctly predicted   : {pt['denied_correct_pct']:.2f}%   (Incorrect: {pt['denied_incorrect_pct']:.2f}%)")

    print(f"\nMetrics:")
    print(f"  Accuracy: {pt['acc']:.4f} | Precision(1): {pt['prec1']:.4f} | Recall(1): {pt['rec1']:.4f} | Specificity(0): {pt['spec0']:.4f} | F1(1): {pt['f1c']:.4f}")

print_block(pt_default, "Default (0.50)")
print_block(pt_best_f1, "Best F1 (Certified)")
print_block(pt_balanced_counts, "Balanced Predicted Counts")
print_block(pt_parity, "Parity of Correctness (Recall≈Specificity)")

=== Threshold Summary on TEST ===

--- Default (0.50) @ threshold=0.500 ---
                      Pred DENIED (0)  Pred CERTIFIED (1)
Actual DENIED (0)                 857                 836
Actual CERTIFIED (1)              460                2943

Counts:
  Predicted CERTIFIED (1): 3779
  Predicted DENIED (0)   : 1317

Correctness (% of each actual class):
  Certified correctly predicted: 86.48%   (Incorrect: 13.52%)
  Denied correctly predicted   : 50.62%   (Incorrect: 49.38%)

Metrics:
  Accuracy: 0.7457 | Precision(1): 0.7788 | Recall(1): 0.8648 | Specificity(0): 0.5062 | F1(1): 0.8195

--- Best F1 (Certified) @ threshold=0.480 ---
                      Pred DENIED (0)  Pred CERTIFIED (1)
Actual DENIED (0)                 832                 861
Actual CERTIFIED (1)              428                2975

Counts:
  Predicted CERTIFIED (1): 3836
  Predicted DENIED (0)   : 1260

Correctness (% of each actual class):
  Certified correctly predicted: 87.42%   (Incorrect: 12.58%)
  Denied correctly predicted   : 49.14%   (Incorrect: 50.86%)

Metrics:
  Accuracy: 0.7471 | Precision(1): 0.7755 | Recall(1): 0.8742 | Specificity(0): 0.4914 | F1(1): 0.8219

--- Balanced Predicted Counts @ threshold=0.710 ---
                      Pred DENIED (0)  Pred CERTIFIED (1)
Actual DENIED (0)                1282                 411
Actual CERTIFIED (1)             1243                2160

Counts:
  Predicted CERTIFIED (1): 2571
  Predicted DENIED (0)   : 2525

Correctness (% of each actual class):
  Certified correctly predicted: 63.47%   (Incorrect: 36.53%)
  Denied correctly predicted   : 75.72%   (Incorrect: 24.28%)

Metrics:
  Accuracy: 0.6754 | Precision(1): 0.8401 | Recall(1): 0.6347 | Specificity(0): 0.7572 | F1(1): 0.7231

--- Parity of Correctness (Recall≈Specificity) @ threshold=0.680 ---
                      Pred DENIED (0)  Pred CERTIFIED (1)
Actual DENIED (0)                1180                 513
Actual CERTIFIED (1)             1009                2394

Counts:
  Predicted CERTIFIED (1): 2907
  Predicted DENIED (0)   : 2189

Correctness (% of each actual class):
  Certified correctly predicted: 70.35%   (Incorrect: 29.65%)
  Denied correctly predicted   : 69.70%   (Incorrect: 30.30%)

Metrics:
  Accuracy: 0.7013 | Precision(1): 0.8235 | Recall(1): 0.7035 | Specificity(0): 0.6970 | F1(1): 0.7588

# ============================================================
# 18) Business Alignment: Outcomes vs. Problem Statement
#     - Quantify how we facilitate approvals AND catch denials
#     - Report results at: default 0.50, Best-F1(thr on VALID), and Even(Recall≈Specificity)
#     - Summarize in business language + technical metrics
#     - Show top drivers (features) used for recommendations
# ============================================================

import numpy as np
import pandas as pd
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
)

# -------- Helpers --------
def proba_positive(est, X):
    """P(CERTIFIED=1)."""
    clf = est.named_steps["clf"]
    if hasattr(clf, "predict_proba"):
        return est.predict_proba(X)[:, 1]
    raise RuntimeError("Selected model does not provide predict_proba; required for threshold analysis.")

def evaluate_at_threshold(y_true, y_prob, thr):
    y_pred = (y_prob >= thr).astype(int)
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    acc  = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, pos_label=1, zero_division=0)
    rec1 = recall_score(y_true, y_pred, pos_label=1, zero_division=0)      # Certified recall
    f1c  = f1_score(y_true, y_pred, pos_label=1, zero_division=0)
    spec0 = tn / (tn + fp) if (tn + fp) else 0.0                            # Denied specificity
    auc  = roc_auc_score(y_true, y_prob)

    total_denied = tn + fp
    total_cert   = tp + fn
    out = {
        "Threshold": thr,
        "Accuracy": acc,
        "Precision(1)": prec,
        "Recall(1)_Certified": rec1,
        "Specificity(0)_Denied": spec0,
        "F1(1)": f1c,
        "ROC_AUC": auc,
        "TP": int(tp), "FP": int(fp), "TN": int(tn), "FN": int(fn),
        "Total_Denied": int(total_denied), "Total_Certified": int(total_cert),
        "Cert_Correct_%": (tp / total_cert * 100) if total_cert else 0.0,
        "Denied_Correct_%": (tn / total_denied * 100) if total_denied else 0.0
    }
    return out, cm

def best_f1_threshold_from_valid(model, X_valid, y_valid):
    """Pick threshold that maximizes F1 on VALID; fallback to 0.50 if anything goes wrong."""
    try:
        vprob = proba_positive(model, X_valid)
        grid = np.linspace(0.20, 0.80, 121)
        best_t, best_f1 = 0.50, -1
        for t in grid:
            ypv = (vprob >= t).astype(int)
            f1c = f1_score(y_valid, ypv, pos_label=1, zero_division=0)
            if f1c > best_f1:
                best_f1, best_t = f1c, t
        return float(best_t)
    except Exception:
        return 0.50

def even_threshold_from_valid(model, X_valid, y_valid):
    """Pick threshold that balances Certified recall and Denied specificity on VALID."""
    try:
        vprob = proba_positive(model, X_valid)
        grid = np.linspace(0.20, 0.80, 121)
        best_t, best_hm = 0.50, -1
        for t in grid:
            ypv = (vprob >= t).astype(int)
            cm = confusion_matrix(y_valid, ypv, labels=[0,1])
            tn, fp, fn, tp = cm.ravel()
            rec1 = tp / (tp + fn) if (tp + fn) else 0.0
            spec0 = tn / (tn + fp) if (tn + fp) else 0.0
            hm = (2 * rec1 * spec0) / (rec1 + spec0) if (rec1 + spec0) else 0.0
            if hm > best_hm:
                best_hm, best_t = hm, t
        return float(best_t)
    except Exception:
        return 0.50

# -------- Determine thresholds (use existing ones if defined earlier) --------
try:
    THR_BEST_F1 = VAL_best_f1_threshold  # from earlier cell if present
except NameError:
    THR_BEST_F1 = best_f1_threshold_from_valid(best_model, X_valid, y_valid)

try:
    THR_EVEN = EVEN_thr  # from earlier cell if present
except NameError:
    THR_EVEN = even_threshold_from_valid(best_model, X_valid, y_valid)

THR_DEFAULT = 0.50

# -------- Evaluate on TEST (business acceptance requires held-out test) --------
test_prob = proba_positive(best_model, X_test)

res_default, cm_default = evaluate_at_threshold(y_test, test_prob, THR_DEFAULT)
res_bestf1, cm_bestf1   = evaluate_at_threshold(y_test, test_prob, THR_BEST_F1)
res_even, cm_even       = evaluate_at_threshold(y_test, test_prob, THR_EVEN)

summary = pd.DataFrame([
    {"Point": "Default (0.50)", **{k: v for k, v in res_default.items() if k not in ["TP","FP","TN","FN"]}},
    {"Point": "Best-F1 (Valid-chosen)", **{k: v for k, v in res_bestf1.items() if k not in ["TP","FP","TN","FN"]}},
    {"Point": "Even Recall/Specificity", **{k: v for k, v in res_even.items() if k not in ["TP","FP","TN","FN"]}},
]).round(4)

print("=== Business Alignment Summary (TEST) ===")
display(summary)

def print_confusion_block(name, res, cm):
    print(f"\n--- {name} @ threshold={res['Threshold']:.3f} ---")
    print(pd.DataFrame(cm,
                       index=["Actual DENIED (0)", "Actual CERTIFIED (1)"],
                       columns=["Pred DENIED (0)", "Pred CERTIFIED (1)"]))
    print(f"Certified correctly predicted: {res['Cert_Correct_%']:.2f}% "
          f"({res['TP']} / {res['Total_Certified']})")
    print(f"Denied    correctly predicted: {res['Denied_Correct_%']:.2f}% "
          f"({res['TN']} / {res['Total_Denied']})")
    print(f"Accuracy={res['Accuracy']:.4f} | Precision(1)={res['Precision(1)']:.4f} | "
          f"Recall(1)={res['Recall(1)_Certified']:.4f} | Specificity(0)={res['Specificity(0)_Denied']:.4f} | "
          f"F1(1)={res['F1(1)']:.4f} | ROC_AUC={res['ROC_AUC']:.4f}")

print_confusion_block("Default (0.50)", res_default, cm_default)
print_confusion_block("Best-F1 (Valid-chosen)", res_bestf1, cm_bestf1)
print_confusion_block("Even Recall/Specificity", res_even, cm_even)

# -------- Drivers of recommendation (top features) --------
# Try to extract model-native importances; fallback to permutation on VALID if needed.
def get_feature_names_from_ct(ct):
    """Get transformed feature names from ColumnTransformer."""
    feat_names = []
    for name, trans, cols in ct.transformers_:
        if name == 'remainder' and trans == 'drop':
            continue
        if hasattr(trans, 'get_feature_names_out'):
            # Pipelines: last step may be OHE
            try:
                fn = trans.get_feature_names_out(cols)
            except:
                fn = cols
        else:
            fn = cols
        feat_names.extend(list(fn))
    return np.array(feat_names, dtype=object)

X_names = get_feature_names_from_ct(best_model.named_steps["pre"])

importances = None
clf = best_model.named_steps["clf"]
if hasattr(clf, "feature_importances_"):
    importances = clf.feature_importances_
elif hasattr(clf, "coef_"):
    # Use absolute coefficients (multi-dim safety)
    coef = np.ravel(clf.coef_)
    importances = np.abs(coef)

top_feat_df = None
if importances is not None and len(importances) == len(X_names):
    idx = np.argsort(importances)[::-1][:15]
    top_feat_df = pd.DataFrame({
        "Feature": X_names[idx],
        "Importance": importances[idx]
    })
    print("\nTop drivers (model-native importance/coefficient):")
    display(top_feat_df)
else:
    # Lightweight permutation importance on VALID to avoid leakage
    try:
        from sklearn.inspection import permutation_importance
        val_imp = permutation_importance(best_model, X_valid, y_valid, n_repeats=5, random_state=42, scoring="f1")
        idx = np.argsort(val_imp.importances_mean)[::-1][:15]
        top_feat_df = pd.DataFrame({
            "Feature": X_names[idx],
            "Importance": val_imp.importances_mean[idx]
        })
        print("\nTop drivers (permutation importance on VALID):")
        display(top_feat_df)
    except Exception as e:
        print("\nSkipping feature importance (not supported for this model).", e)

# -------- Executive-readable mapping to the objective --------
def pct(x):
    return f"{100*x:.1f}%"

# Pick the most “balanced” operating point for narrative (Even or Best-F1 depending on context)
narr = res_even if abs(res_even["Recall(1)_Certified"] - res_even["Specificity(0)_Denied"]) <= \
                  abs(res_bestf1["Recall(1)_Certified"] - res_bestf1["Specificity(0)_Denied"]) else res_bestf1
narr_label = "Even Recall/Specificity" if narr is res_even else "Best-F1"

print("\n=== Business Narrative (TEST) ===")
print(
    f"At the '{narr_label}' operating point (threshold={narr['Threshold']:.3f}), the model correctly identifies "
    f"{narr['Cert_Correct_%']:.1f}% of certified cases and {narr['Denied_Correct_%']:.1f}% of denied cases, "
    f"with overall accuracy {pct(narr['Accuracy'])}, F1 for certified {pct(narr['F1(1)'])}, and ROC-AUC {pct(narr['ROC_AUC'])}."
)
print(
    "This directly supports the objective to facilitate approvals (high true-positive rate on certifications) "
    "while protecting U.S. workers by screening out weak applications (strong true-negative rate on denials)."
)

if top_feat_df is not None:
    top_txt = ", ".join(top_feat_df['Feature'].head(5).tolist())
    print(f"Key decision drivers include: {top_txt}. These features inform which profiles are more likely to be certified or denied.")

print("\nInterpretation for stakeholders:")
print(
    "- For operations: the chosen threshold sets how aggressive the shortlisting is. Moving it higher increases the share of predicted denials (more conservative), "
    "moving it lower increases predicted certifications (more throughput)."
)
print(
    "- For compliance and risk: the Even setting balances catching denials with approving qualified applicants, which is easier to justify in audits."
)

=== Business Alignment Summary (TEST) ===

--- Default (0.50) @ threshold=0.500 ---
                      Pred DENIED (0)  Pred CERTIFIED (1)
Actual DENIED (0)                 857                 836
Actual CERTIFIED (1)              460                2943
Certified correctly predicted: 86.48% (2943 / 3403)
Denied    correctly predicted: 50.62% (857 / 1693)
Accuracy=0.7457 | Precision(1)=0.7788 | Recall(1)=0.8648 | Specificity(0)=0.5062 | F1(1)=0.8195 | ROC_AUC=0.7748

--- Best-F1 (Valid-chosen) @ threshold=0.430 ---
                      Pred DENIED (0)  Pred CERTIFIED (1)
Actual DENIED (0)                 622                1071
Actual CERTIFIED (1)              294                3109
Certified correctly predicted: 91.36% (3109 / 3403)
Denied    correctly predicted: 36.74% (622 / 1693)
Accuracy=0.7321 | Precision(1)=0.7438 | Recall(1)=0.9136 | Specificity(0)=0.3674 | F1(1)=0.8200 | ROC_AUC=0.7748

--- Even Recall/Specificity @ threshold=0.680 ---
                      Pred DENIED (0)  Pred CERTIFIED (1)
Actual DENIED (0)                1180                 513
Actual CERTIFIED (1)             1009                2394
Certified correctly predicted: 70.35% (2394 / 3403)
Denied    correctly predicted: 69.70% (1180 / 1693)
Accuracy=0.7013 | Precision(1)=0.8235 | Recall(1)=0.7035 | Specificity(0)=0.6970 | F1(1)=0.7588 | ROC_AUC=0.7748

Top drivers (model-native importance/coefficient):

=== Business Narrative (TEST) ===
At the 'Even Recall/Specificity' operating point (threshold=0.680), the model correctly identifies 70.3% of certified cases and 69.7% of denied cases, with overall accuracy 70.1%, F1 for certified 75.9%, and ROC-AUC 77.5%.
This directly supports the objective to facilitate approvals (high true-positive rate on certifications) while protecting U.S. workers by screening out weak applications (strong true-negative rate on denials).
Key decision drivers include: education_of_employee_HIGH SCHOOL, has_job_experience_NO, education_of_employee_BACHELOR'S, prevailing_wage, continent_EUROPE. These features inform which profiles are more likely to be certified or denied.

Interpretation for stakeholders:
- For operations: the chosen threshold sets how aggressive the shortlisting is. Moving it higher increases the share of predicted denials (more conservative), moving it lower increases predicted certifications (more throughput).
- For compliance and risk: the Even setting balances catching denials with approving qualified applicants, which is easier to justify in audits.

	count	mean	std	min	25%	50%	75%	max
no_of_employees	25480.0	5667.043210	22877.928848	-26.0000	1022.00	2109.00	3504.0000	602069.00
yr_of_estab	25480.0	1979.409929	42.366929	1800.0000	1976.00	1997.00	2005.0000	2016.00
prevailing_wage	25480.0	74455.814592	52815.942327	2.1367	34015.48	70308.21	107735.5125	319210.27

	cert_rate	count
education_of_employee
DOCTORATE	0.772871	317
MASTER'S	0.708366	2534
BACHELOR'S	0.520325	2583
HIGH SCHOOL	0.323718	936

	cert_rate	count
continent
EUROPE	0.792337	3732
AFRICA	0.720508	551
ASIA	0.653105	16861
OCEANIA	0.635417	192
NORTH AMERICA	0.618773	3292
SOUTH AMERICA	0.578638	852

	count	median	mean
region_of_employment
ISLAND	375	96317.45	170732.100005
MIDWEST	4307	94520.54	159357.064221
SOUTH	7017	84812.35	208889.577465
NORTHEAST	7195	81428.85	224671.577777
WEST	6586	73867.56	181728.067983

	cert_rate	count
wage_bin
(99.999, 47107.965]	0.711617	6370
(47107.965, 82839.46]	0.696389	6370
(82839.46, 124825.035]	0.684772	6370
(124825.035, 14569149.4]	0.578807	6370

Executive Summary¶

Problem Statement¶

0. Library Installation¶

2. Load Dataset¶

2. Load Dataset¶

3. Data Understanding (Types, Missingness, Target Balance)¶

4. Primary Cleaning (Essential, Non-destructive)¶

5. Exploratory Data Analysis (Univariate)¶

6. Exploratory Data Analysis (Bivariate)¶

7. Analytical Insights¶

8. Data Preparation for Modeling¶

9. Metric and Visualization Utilities¶

10. Baseline Models on ORIGINAL TRAIN (No Resampling)¶

11. Model Building — OVERSAMPLED TRAIN (SMOTE)¶

12. Model Building — UNDERSAMPLED TRAIN¶

13. Select Top Candidates Across All Methods¶

14. Hyperparameter Tuning (CV + Randomized Search)¶

15. Threshold Optimization¶

16. Validation Threshold Sweep — Multi-Threshold Confusion Metrics¶

17. Threshold Sweep with Key Operating Points¶

18. Business Alignment: Outcomes vs. Problem Statement¶

Comprehensive Summary¶

	case_id	continent	education_of_employee	has_job_experience	requires_job_training	no_of_employees	yr_of_estab	region_of_employment	prevailing_wage	unit_of_wage	full_time_position	case_status
0	EZYV01	Asia	High School	N	N	14513	2007	West	592.2029	Hour	Y	Denied
1	EZYV02	Asia	Master's	Y	N	2412	2002	Northeast	83425.6500	Year	Y	Certified
2	EZYV03	Asia	Bachelor's	N	Y	44444	2008	West	122996.8600	Year	Y	Denied
3	EZYV04	Asia	Bachelor's	N	N	98	1897	West	83434.0300	Year	Y	Denied
4	EZYV05	Africa	Master's	Y	N	1082	2005	South	149907.3900	Year	Y	Certified

	case_id	continent	education_of_employee	has_job_experience	requires_job_training	no_of_employees	yr_of_estab	region_of_employment	prevailing_wage	unit_of_wage	full_time_position	case_status
25475	EZYV25476	Asia	Bachelor's	Y	Y	2601	2008	South	77092.57	Year	Y	Certified
25476	EZYV25477	Asia	High School	Y	N	3274	2006	Northeast	279174.79	Year	Y	Certified
25477	EZYV25478	Asia	Master's	Y	N	1121	1910	South	146298.85	Year	N	Certified
25478	EZYV25479	Asia	Master's	Y	Y	1918	1887	West	86154.77	Year	Y	Certified
25479	EZYV25480	Asia	Bachelor's	Y	N	3195	1960	Midwest	70876.91	Year	Y	Certified

	0
case_id	0
continent	0
education_of_employee	0
has_job_experience	0
requires_job_training	0
no_of_employees	0
yr_of_estab	0
region_of_employment	0
prevailing_wage	0
unit_of_wage	0
full_time_position	0
case_status	0

	cert_rate	count
unit_of_wage
YEAR	0.698850	22962
WEEK	0.621324	272
MONTH	0.617978	89
HOUR	0.346314	2157

	Model	Accuracy	Precision(1)	Recall(1)	F1(1)	ROC_AUC	Spec(0)
0	Gradient Boosting	0.7447	0.7743	0.8719	0.8202	0.7753	0.4888
1	XGBoost	0.7380	0.7716	0.8634	0.8149	0.7643	0.4858
2	AdaBoost	0.7284	0.7545	0.8796	0.8123	0.7649	0.4243
3	Random Forest (weighted)	0.7321	0.7686	0.8569	0.8104	0.7535	0.4811
4	Bagging (DT base)	0.7239	0.7671	0.8425	0.8030	0.7503	0.4852
5	Logistic (weighted)	0.7051	0.8124	0.7262	0.7669	0.7671	0.6625
6	Decision Tree (weighted)	0.6668	0.7434	0.7653	0.7542	0.6170	0.4687

	Model	Accuracy	Precision(1)	Recall(1)	F1(1)	ROC_AUC	Spec(0)
0	Random Forest + SMOTE (weighted)	0.7102	0.7820	0.7850	0.7835	0.7497	0.5597
1	Gradient Boosting + SMOTE	0.7113	0.8095	0.7427	0.7746	0.7725	0.6483
2	AdaBoost + SMOTE	0.7023	0.7996	0.7397	0.7685	0.7607	0.6271
3	Decision Tree + SMOTE (weighted)	0.6529	0.7519	0.7168	0.7339	0.6205	0.5242
4	Logistic + SMOTE (weighted)	0.6195	0.8786	0.4994	0.6368	0.7664	0.8611

	Model	Accuracy	Precision(1)	Recall(1)	F1(1)	ROC_AUC	Spec(0)
0	AdaBoost + Under	0.7058	0.8015	0.7438	0.7716	0.7638	0.6294
1	Gradient Boosting + Under	0.7078	0.8191	0.7221	0.7675	0.7746	0.6791
2	Random Forest + Under (weighted)	0.6900	0.8071	0.7042	0.7521	0.7495	0.6613
3	Decision Tree + Under (weighted)	0.6201	0.7542	0.6398	0.6923	0.6101	0.5804
4	Logistic + Under (weighted)	0.6091	0.8845	0.4771	0.6198	0.7661	0.8747

	Threshold	Accuracy	Precision(1)	Recall(1)	F1(1)	Spec(0)	Score
0	0.745	0.656986	0.858131	0.582844	0.694192	0.806147	0.738974
1	0.740	0.658556	0.854949	0.588719	0.697286	0.799054	0.737993
2	0.750	0.652473	0.860168	0.572855	0.687709	0.812648	0.737685
3	0.735	0.660911	0.852101	0.595770	0.701245	0.791962	0.737532
4	0.765	0.639521	0.873273	0.538484	0.666182	0.842790	0.736825
5	0.715	0.674254	0.840891	0.631904	0.721570	0.759456	0.736724
6	0.710	0.678768	0.837859	0.643655	0.728028	0.749409	0.736580
7	0.730	0.662677	0.848861	0.602233	0.704588	0.784279	0.736465
8	0.760	0.642857	0.868030	0.548766	0.672426	0.832151	0.736316
9	0.720	0.669545	0.843176	0.620740	0.715059	0.767730	0.736128

	threshold	accuracy	precision(1)	recall(1)	f1(1)	spec(0)	cert_ok%	denied_ok%	tn	fp	fn	tp
0	0.43	0.7370	0.7443	0.9236	0.8243	0.3617	0.9236	0.3617	612	1080	260	3144
1	0.50	0.7429	0.7734	0.8702	0.8189	0.4870	0.8702	0.4870	824	868	442	2962
2	0.60	0.7341	0.7958	0.8096	0.8027	0.5822	0.8096	0.5822	985	707	648	2756
3	0.65	0.7157	0.8075	0.7541	0.7799	0.6383	0.7541	0.6383	1080	612	837	2567
4	0.68	0.7023	0.8228	0.7065	0.7602	0.6939	0.7065	0.6939	1174	518	999	2405
5	0.70	0.6880	0.8301	0.6701	0.7415	0.7240	0.6701	0.7240	1225	467	1123	2281

	Point	Threshold	Accuracy	Precision(1)	Recall(1) (Certified)	Specificity(0) (Denied)	F1(1)	Pred Certified (count)	Pred Denied (count)	Certified correct %	Denied correct %
0	Default (0.50)	0.50	0.7457	0.7788	0.8648	0.5062	0.8195	3779	1317	86.48%	50.62%
1	Best F1 (Certified)	0.48	0.7471	0.7755	0.8742	0.4914	0.8219	3836	1260	87.42%	49.14%
2	Balanced Predicted Counts	0.71	0.6754	0.8401	0.6347	0.7572	0.7231	2571	2525	63.47%	75.72%
3	Parity of Correctness (Recall≈Specificity)	0.68	0.7013	0.8235	0.7035	0.6970	0.7588	2907	2189	70.35%	69.70%

	Feature	Importance
0	education_of_employee_HIGH SCHOOL	0.287058
1	has_job_experience_NO	0.097009
2	education_of_employee_BACHELOR'S	0.095770
3	prevailing_wage	0.089537
4	continent_EUROPE	0.063272
5	unit_of_wage_HOUR	0.061306
6	has_job_experience_YES	0.057657
7	no_of_employees	0.032576
8	region_of_employment_MIDWEST	0.032409
9	unit_of_wage_YEAR	0.025646
10	prevailing_wage_yearly	0.023937
11	continent_NORTH AMERICA	0.020471
12	region_of_employment_WEST	0.017753
13	education_of_employee_DOCTORATE	0.015850
14	education_of_employee_MASTER'S	0.014895