Retail_Prediction_EV_v26¶
Executive Summary¶
This notebook delivers a robust, deployable solution for forecasting product-store sales, directly addressing profit leakage caused by demand variability. By leveraging advanced regression techniques and a user-friendly deployment interface, this tool empowers the business to:
- Optimize Inventory: Reduce overstock and understock events by accurately predicting demand.
- Enhance Capital Efficiency: Free up working capital tied to slow-moving inventory.
- Drive Margin Stability: Minimize markdowns and lost sales opportunities.
Key Achievement: A verified machine learning pipeline that transforms raw sales data into actionable revenue forecasts, accessible via a web-based interface.
Deployment Section¶
The forecasting model has been deployed to a live environment for immediate business use:
LOAD FIRST: Backend (API): https://huggingface.co/spaces/EvagAIML/RetailPrediction001Backend
LOAD SECOND: Frontend (User Interface): https://huggingface.co/spaces/EvagAIML/RetailPrediction001frontend
Note: The application supports both interactive single-item prediction and batch processing for bulk forecasting.
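For batch or programmatic use, a request payload can be assembled from the feature names in the data dictionary below. This is a minimal sketch only: the endpoint path and exact request schema are assumptions and should be verified against the backend Space.

```python
import json

# Hypothetical single-item payload built from the notebook's feature names.
# The endpoint path and schema in the comment below are assumptions.
payload = {
    "Product_Weight": 12.66,
    "Product_Sugar_Content": "Low Sugar",
    "Product_Allocated_Area": 0.027,
    "Product_Type": "Frozen Foods",
    "Product_MRP": 117.08,
    "Store_Id": "OUT004",
    "Store_Establishment_Year": 2009,
    "Store_Size": "Medium",
    "Store_Location_City_Type": "Tier 2",
    "Store_Type": "Supermarket Type2",
}
body = json.dumps(payload)

# A real call might look like (hypothetical endpoint path):
# requests.post("<backend-space-url>/predict", json=payload, timeout=30)
```

Batch forecasting would send a list of such records instead of a single object.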
Problem Statement¶
A multi-location food and beverage retail organization faces profit leakage driven by demand variability at the product–store level, where even modest forecast errors translate directly into excess inventory, lost sales, and margin compression. This model aims to predict total sales for each product–store combination to reduce revenue volatility and improve capital efficiency.
Business Context¶
Small forecasting errors scale into material financial impact across thousands of product–store combinations.
Overstock ties up working capital and increases holding costs.
Understock directly reduces revenue capture and customer retention.
Improving prediction accuracy compounds into measurable gains in margin stability and inventory efficiency.
Objective¶
Build and evaluate regression models to forecast product–store sales.
Select the model with the strongest out-of-sample performance.
Establish a deployable forecasting capability to support planning decisions.
Data¶
Product_Id: Unique identifier of each product; each identifier begins with two letters followed by a number
Product_Weight: Weight of each product
Product_Sugar_Content: Sugar content of each product, like low sugar, regular, and no sugar
Product_Allocated_Area: Ratio of the allocated display area of each product to the total display area of all the products in a store
Product_Type: Broad category for each product like meat, snack foods, hard drinks, dairy, canned, soft drinks, health and hygiene, baking goods, bread, breakfast, frozen foods, fruits and vegetables, household, seafood, starchy foods, others
Product_MRP: Maximum retail price of each product
Store_Id: Unique identifier of each store
Store_Establishment_Year: Year in which the store was established
Store_Size: Size of the store, depending on sq. feet, like high, medium, and low
Store_Location_City_Type: Type of city in which the store is located, like Tier 1, Tier 2, and Tier 3. Tier 1 consists of cities where the standard of living is comparatively higher than that of its Tier 2 and Tier 3 counterparts
Store_Type: Type of store depending on the products that are being sold there, like Departmental Store, Supermarket Type 1, Supermarket Type 2, and Food Mart
Product_Store_Sales_Total: Total revenue generated by the sale of that particular product in that particular store
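Since each Product_Id encodes a two-letter prefix followed by a number, the prefix can be separated out if category-level feature engineering is needed. A small sketch with a hypothetical helper (not part of the notebook's pipeline):

```python
import re

# Hypothetical helper: split a Product_Id like "FD6114" into its
# two-letter prefix and numeric suffix, per the format described above.
def split_product_id(product_id: str) -> tuple[str, int]:
    match = re.fullmatch(r"([A-Za-z]{2})(\d+)", product_id)
    if match is None:
        raise ValueError(f"Unexpected Product_Id format: {product_id!r}")
    return match.group(1), int(match.group(2))
```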
Environment setup¶
Environment Configuration¶
Installs required libraries.
#-------------
# ENVIRONMENT CONFIGURATION
#-------------
# pip is used to install pinned dependencies for reproducible execution.
# Version constraints are applied to stabilize model training behavior.
# Installing the libraries with the specified versions
# Suppressing pip dependency resolution errors by redirecting stderr to /dev/null, which is acceptable as library versions are pinned for reproducibility.
!pip install numpy==2.0.2 pandas==2.2.2 scikit-learn==1.6.1 matplotlib==3.10.0 seaborn==0.13.2 joblib==1.4.2 xgboost==2.1.4 requests==2.32.3 huggingface_hub==0.30.1 -q 2>/dev/null
Outcomes: Runtime dependencies installed with pinned versions to ensure reproducible model training and evaluation.
Important Note: restart the runtime after executing this cell so the pinned versions take effect.
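After restarting the runtime, the pinned versions can be confirmed before continuing. A dependency-free sketch using the standard library (the helper name is illustrative, not part of the notebook):

```python
from importlib.metadata import version, PackageNotFoundError

# Return the installed version string, or None if the package is absent.
def get_version(dist_name: str):
    try:
        return version(dist_name)
    except PackageNotFoundError:
        return None

for name in ["numpy", "pandas", "scikit-learn", "xgboost"]:
    print(name, "->", get_version(name))
```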
Library Import¶
Imports data science libraries.
#-------------
# LIBRARY IMPORT
#-------------
# numpy is used for numerical computation and array operations.
# pandas is used for structured data manipulation and analysis.
# matplotlib.pyplot is used to render static visualizations.
# seaborn is used to generate statistical plots and heatmaps.
# sklearn modules are used to construct preprocessing pipelines and regression models.
# Libraries to help with reading and manipulating data
import numpy as np
import pandas as pd
# For splitting the dataset
from sklearn.model_selection import train_test_split
# Libraries to help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns
# Removes the limit for the number of displayed columns
pd.set_option("display.max_columns", None)
# Sets the limit for the number of displayed rows
pd.set_option("display.max_rows", 100)
# Libraries for different ensemble regressors
from sklearn.ensemble import (
    BaggingRegressor,
    RandomForestRegressor,
    AdaBoostRegressor,
    GradientBoostingRegressor,
)
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
# Libraries to get regression metric scores
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score,
    mean_absolute_percentage_error,
)
# To create the pipeline
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.pipeline import make_pipeline, Pipeline
# To tune different models and standardize
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
# To serialize the model
import joblib
# OS-related functionality
import os
# For making API requests
import requests
# For Hugging Face Space authentication to upload files
from huggingface_hub import login, HfApi
# To suppress warnings
import warnings
warnings.filterwarnings("ignore")
Outcomes: Regression modeling, preprocessing, visualization, and evaluation libraries initialized for full benchmarking workflow.
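The preprocessing and pipeline imports above combine in a standard pattern: scale numeric columns, one-hot encode categoricals, and chain a regressor. A minimal sketch on toy data (the notebook's real feature lists are derived from the dataset later):

```python
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeRegressor

# Toy frame mimicking the dataset's schema -- illustrative values only.
toy = pd.DataFrame({
    "Product_MRP": [117.08, 171.43, 162.08, 186.31],
    "Store_Size": ["Medium", "Medium", "High", "High"],
    "Product_Store_Sales_Total": [2842.40, 4830.02, 4130.16, 4132.18],
})
X = toy.drop(columns="Product_Store_Sales_Total")
y = toy["Product_Store_Sales_Total"]

# Scale numeric features, one-hot encode categoricals, then fit a regressor.
preprocess = ColumnTransformer([
    ("num", StandardScaler(), ["Product_MRP"]),
    ("cat", OneHotEncoder(handle_unknown="ignore"), ["Store_Size"]),
])
model = Pipeline([("prep", preprocess), ("reg", DecisionTreeRegressor(random_state=1))])
model.fit(X, y)
preds = model.predict(X)
```

Bundling preprocessing and the estimator in one `Pipeline` means the serialized model carries its own encoding logic into deployment.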
Drive Mount¶
Optionally mounts Google Drive.
#-------------
# DRIVE MOUNT
#-------------
# Uncomment the snippet below if Google Drive needs to be mounted
# from google.colab import drive
# drive.mount('/content/drive')
Outcomes: Execution configured to operate without external storage dependencies.
Data ingestion and dataset validation¶
Data Ingestion¶
Loads the dataset.
#-------------
# DATA INGESTION
#-------------
# pandas.read_csv is used to load the dataset into a DataFrame.
# This DataFrame serves as the canonical source for all downstream analysis.
kart = pd.read_csv("https://raw.githubusercontent.com/EvagAIML/020_Model_Deployment/main/RetailPrediction%20copy.csv")
Outcomes: Source dataset successfully loaded as the single canonical dataset for all downstream analysis.
Data Checkpoint¶
Creates a working copy.
#-------------
# DATA CHECKPOINT
#-------------
# DataFrame.copy is used to create an isolated working dataset.
# This prevents mutation of the original raw dataset.
# Copying data to another variable to avoid any changes to original data
data = kart.copy()
Outcomes: Working dataset isolated from raw source to prevent unintended mutation.
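Why the checkpoint matters: `DataFrame.copy()` produces an independent frame, so edits to the working copy never propagate back to the raw source. A toy demonstration (illustrative values, not the notebook's dataset):

```python
import pandas as pd

# Mutating the working copy leaves the original frame untouched.
raw = pd.DataFrame({"Product_MRP": [117.08, 171.43]})
work = raw.copy()
work.loc[0, "Product_MRP"] = 0.0
print(raw.loc[0, "Product_MRP"])  # original value is preserved
```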
Data Sample Inspection¶
Views first 5 rows.
#-------------
# DATA SAMPLE INSPECTION
#-------------
# DataFrame.head is used to inspect initial records for schema validation.
# This confirms column structure and representative values.
data.head()
| | Product_Id | Product_Weight | Product_Sugar_Content | Product_Allocated_Area | Product_Type | Product_MRP | Store_Id | Store_Establishment_Year | Store_Size | Store_Location_City_Type | Store_Type | Product_Store_Sales_Total |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | FD6114 | 12.66 | Low Sugar | 0.027 | Frozen Foods | 117.08 | OUT004 | 2009 | Medium | Tier 2 | Supermarket Type2 | 2842.40 |
| 1 | FD7839 | 16.54 | Low Sugar | 0.144 | Dairy | 171.43 | OUT003 | 1999 | Medium | Tier 1 | Departmental Store | 4830.02 |
| 2 | FD5075 | 14.28 | Regular | 0.031 | Canned | 162.08 | OUT001 | 1987 | High | Tier 2 | Supermarket Type1 | 4130.16 |
| 3 | FD8233 | 12.10 | Low Sugar | 0.112 | Baking Goods | 186.31 | OUT001 | 1987 | High | Tier 2 | Supermarket Type1 | 4132.18 |
| 4 | NC1180 | 9.57 | No Sugar | 0.010 | Health and Hygiene | 123.67 | OUT002 | 1998 | Small | Tier 3 | Food Mart | 2279.36 |
Outcomes: Dataset includes product attributes, store attributes, and total sales target required for supervised regression.
Data Sample Inspection (Tail)¶
Views last 5 rows.
#-------------
# DATA SAMPLE INSPECTION (TAIL)
#-------------
# DataFrame.tail is used to inspect final records for structural consistency.
# This verifies dataset completeness and absence of truncation artifacts.
data.tail()
| | Product_Id | Product_Weight | Product_Sugar_Content | Product_Allocated_Area | Product_Type | Product_MRP | Store_Id | Store_Establishment_Year | Store_Size | Store_Location_City_Type | Store_Type | Product_Store_Sales_Total |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 8758 | NC7546 | 14.80 | No Sugar | 0.016 | Health and Hygiene | 140.53 | OUT004 | 2009 | Medium | Tier 2 | Supermarket Type2 | 3806.53 |
| 8759 | NC584 | 14.06 | No Sugar | 0.142 | Household | 144.51 | OUT004 | 2009 | Medium | Tier 2 | Supermarket Type2 | 5020.74 |
| 8760 | NC2471 | 13.48 | No Sugar | 0.017 | Health and Hygiene | 88.58 | OUT001 | 1987 | High | Tier 2 | Supermarket Type1 | 2443.42 |
| 8761 | NC7187 | 13.89 | No Sugar | 0.193 | Household | 168.44 | OUT001 | 1987 | High | Tier 2 | Supermarket Type1 | 4171.82 |
| 8762 | FD306 | 14.73 | Low Sugar | 0.177 | Snack Foods | 224.93 | OUT002 | 1998 | Small | Tier 3 | Food Mart | 2186.08 |
Outcomes: Records appear structurally consistent through final rows with no truncation artifacts.
Dataset Dimensions¶
Checks dataset size.
#-------------
# DATASET DIMENSIONS
#-------------
# DataFrame.shape is used to confirm row and column counts.
# This validates dataset scale prior to modeling.
print(f"There are {data.shape[0]} rows and {data.shape[1]} columns.")
There are 8763 rows and 12 columns.
Outcomes: 8,763 records across 12 features; dataset scale is sufficient for ensemble-based regression without dimensionality reduction.
Schema and Dtypes¶
Checks data types.
#-------------
# SCHEMA AND DTYPES
#-------------
# DataFrame.info is used to verify data types and non-null counts.
# This confirms feature readiness before preprocessing.
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8763 entries, 0 to 8762
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype
---  ------                     --------------  -----
 0   Product_Id                 8763 non-null   object
 1   Product_Weight             8763 non-null   float64
 2   Product_Sugar_Content      8763 non-null   object
 3   Product_Allocated_Area     8763 non-null   float64
 4   Product_Type               8763 non-null   object
 5   Product_MRP                8763 non-null   float64
 6   Store_Id                   8763 non-null   object
 7   Store_Establishment_Year   8763 non-null   int64
 8   Store_Size                 8763 non-null   object
 9   Store_Location_City_Type   8763 non-null   object
 10  Store_Type                 8763 non-null   object
 11  Product_Store_Sales_Total  8763 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 821.7+ KB
Outcomes: All 12 columns are non-null; imputation is not required in preprocessing.
Descriptive Statistics¶
Generates statistical summary.
#-------------
# DESCRIPTIVE STATISTICS
#-------------
# DataFrame.describe is used to compute summary statistics for numeric features.
# This supports dispersion and range analysis prior to modeling.
data.describe(include="all").T
| | count | unique | top | freq | mean | std | min | 25% | 50% | 75% | max |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Product_Id | 8763 | 8763 | FD306 | 1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Product_Weight | 8763.0 | NaN | NaN | NaN | 12.653792 | 2.21732 | 4.0 | 11.15 | 12.66 | 14.18 | 22.0 |
| Product_Sugar_Content | 8763 | 4 | Low Sugar | 4885 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Product_Allocated_Area | 8763.0 | NaN | NaN | NaN | 0.068786 | 0.048204 | 0.004 | 0.031 | 0.056 | 0.096 | 0.298 |
| Product_Type | 8763 | 16 | Fruits and Vegetables | 1249 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Product_MRP | 8763.0 | NaN | NaN | NaN | 147.032539 | 30.69411 | 31.0 | 126.16 | 146.74 | 167.585 | 266.0 |
| Store_Id | 8763 | 4 | OUT004 | 4676 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Store_Establishment_Year | 8763.0 | NaN | NaN | NaN | 2002.032751 | 8.388381 | 1987.0 | 1998.0 | 2009.0 | 2009.0 | 2009.0 |
| Store_Size | 8763 | 3 | Medium | 6025 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Store_Location_City_Type | 8763 | 3 | Tier 2 | 6262 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Store_Type | 8763 | 4 | Supermarket Type2 | 4676 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Product_Store_Sales_Total | 8763.0 | NaN | NaN | NaN | 3464.00364 | 1065.630494 | 33.0 | 2761.715 | 3452.34 | 4145.165 | 8000.0 |
Outcomes: Sales and pricing show wide dispersion; OUT004 materially dominates store representation, which may influence model learning if not evaluated carefully.
Data quality checks¶
Duplicate Record Validation¶
Checks for duplicates.
#-------------
# DUPLICATE RECORD VALIDATION
#-------------
# DataFrame.duplicated is used to identify repeated rows.
# Series.sum is used to quantify duplicate records.
# Checking for duplicate values
data.duplicated().sum()
np.int64(0)
Outcomes: 0 duplicate records detected.
Missing Value Assessment¶
Checks for missing values.
#-------------
# MISSING VALUE ASSESSMENT
#-------------
# DataFrame.isnull is used to detect missing values.
# Aggregation via sum quantifies null counts per column.
# Checking for missing values in the data
data.isnull().sum()
| | 0 |
|---|---|
| Product_Id | 0 |
| Product_Weight | 0 |
| Product_Sugar_Content | 0 |
| Product_Allocated_Area | 0 |
| Product_Type | 0 |
| Product_MRP | 0 |
| Store_Id | 0 |
| Store_Establishment_Year | 0 |
| Store_Size | 0 |
| Store_Location_City_Type | 0 |
| Store_Type | 0 |
| Product_Store_Sales_Total | 0 |
Outcomes: No missing values detected across all columns.
Exploratory data analysis¶
Utility: Histogram Boxplot¶
Defines plotting function.
#-------------
# UTILITY: HISTOGRAM BOXPLOT
#-------------
# Function to plot a boxplot and a histogram along the same scale.
def histogram_boxplot(data, feature, figsize=(12, 7), kde=False, bins=None):
    """
    Boxplot and histogram combined
    data: dataframe
    feature: dataframe column
    figsize: size of figure (default (12,7))
    kde: whether to show the density curve (default False)
    bins: number of bins for histogram (default None)
    """
    f2, (ax_box2, ax_hist2) = plt.subplots(
        nrows=2,  # number of rows of the subplot grid = 2
        sharex=True,  # x-axis will be shared among all subplots
        gridspec_kw={"height_ratios": (0.25, 0.75)},
        figsize=figsize,
    )  # creating the 2 subplots
    sns.boxplot(
        data=data, x=feature, ax=ax_box2, showmeans=True, color="violet"
    )  # boxplot with a star indicating the mean value of the column
    if bins:
        sns.histplot(data=data, x=feature, kde=kde, ax=ax_hist2, bins=bins)
    else:
        sns.histplot(data=data, x=feature, kde=kde, ax=ax_hist2)
    ax_hist2.axvline(
        data[feature].mean(), color="green", linestyle="--"
    )  # add mean to the histogram
    ax_hist2.axvline(
        data[feature].median(), color="black", linestyle="-"
    )  # add median to the histogram
Outcomes: histogram_boxplot utility defined for combined distribution and outlier inspection.
Distribution: Product Weight¶
Visualizes Item_Weight.
#-------------
# DISTRIBUTION: PRODUCT WEIGHT
#-------------
histogram_boxplot(data, "Product_Weight")
Outcomes: Product weight distribution is approximately unimodal with extreme outliers; tree-based models will handle this without scaling.
Distribution: Product Allocated Area¶
Visualizes Allocated Area.
#-------------
# DISTRIBUTION: PRODUCT ALLOCATED AREA
#-------------
histogram_boxplot(data, "Product_Allocated_Area")
Outcomes: Allocated-area distribution is right-skewed (mean above median); most products occupy a small share of display space.
Distribution: Product MRP¶
Visualizes MRP.
#-------------
# DISTRIBUTION: PRODUCT MRP
#-------------
histogram_boxplot(data, "Product_MRP")
Outcomes: MRP displays multimodal pricing tiers, indicating non-linear price effects that justify ensemble modeling.
Distribution: Product Store Sales¶
Visualizes Sales.
#-------------
# DISTRIBUTION: PRODUCT STORE SALES
#-------------
histogram_boxplot(data, "Product_Store_Sales_Total")
Outcomes: Sales are strongly right-skewed with high-revenue outliers; RMSE should be interpreted with sensitivity to extreme values.
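The sensitivity of RMSE to the right tail can be made concrete: a single large miss moves RMSE far more than MAE, because errors are squared before averaging. A toy illustration (illustrative residuals, not the notebook's model errors):

```python
import math

# Four moderate residuals plus one outlier miss mimicking a high-revenue record.
errors = [100.0, -120.0, 80.0, -90.0, 2500.0]

mae = sum(abs(e) for e in errors) / len(errors)
rmse = math.sqrt(sum(e * e for e in errors) / len(errors))
print(f"MAE = {mae:.1f}, RMSE = {rmse:.1f}")  # RMSE is roughly double the MAE here
```

This is why both metrics are reported when the target is skewed: MAE summarizes typical error, RMSE exposes tail risk.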
Utility: Labeled Barplot¶
Defines plotting function.
#-------------
# UTILITY: LABELED BARPLOT
#-------------
# Function to create labeled barplots
def labeled_barplot(data, feature, perc=False, n=None):
    """
    Barplot with percentage at the top
    data: dataframe
    feature: dataframe column
    perc: whether to display percentages instead of count (default is False)
    n: displays the top n category levels (default is None, i.e., display all levels)
    """
    total = len(data[feature])  # length of the column
    count = data[feature].nunique()
    if n is None:
        plt.figure(figsize=(count + 1, 5))
    else:
        plt.figure(figsize=(n + 1, 5))
    plt.xticks(rotation=90, fontsize=15)
    ax = sns.countplot(
        data=data,
        x=feature,
        palette="Paired",
        order=data[feature].value_counts().index[:n].sort_values(),
    )
    for p in ax.patches:
        if perc:
            label = "{:.1f}%".format(
                100 * p.get_height() / total
            )  # percentage of each class of the category
        else:
            label = p.get_height()  # count of each level of the category
        x = p.get_x() + p.get_width() / 2  # horizontal midpoint of the bar
        y = p.get_height()  # height of the bar
        ax.annotate(
            label,
            (x, y),
            ha="center",
            va="center",
            size=12,
            xytext=(0, 5),
            textcoords="offset points",
        )  # annotate the count or percentage
    plt.show()  # show the plot
Outcomes: labeled_barplot utility defined for annotated categorical frequency analysis.
Frequency Analysis: Product_Sugar_Content¶
Analyzes Product_Sugar_Content frequency.
#-------------
# FREQUENCY ANALYSIS: PRODUCT_SUGAR_CONTENT
#-------------
labeled_barplot(data, "Product_Sugar_Content", perc=True)
Outcomes: Low Sugar dominates product mix; categorical imbalance should be preserved during encoding.
Frequency Analysis: Product_Type¶
Analyzes Product_Type frequency.
#-------------
# FREQUENCY ANALYSIS: PRODUCT_TYPE
#-------------
labeled_barplot(data, "Product_Type", perc=True, n=10)
Outcomes: Product categories are broadly distributed; category-level effects are likely significant predictors.
Frequency Analysis: Store_Id¶
Analyzes Store_Id frequency.
#-------------
# FREQUENCY ANALYSIS: STORE_ID
#-------------
labeled_barplot(data, "Store_Id", perc=True)
Outcomes: OUT004 represents the largest share of observations; model evaluation should verify generalization across smaller stores.
Frequency Analysis: Store_Size¶
Analyzes Store_Size frequency.
#-------------
# FREQUENCY ANALYSIS: STORE_SIZE
#-------------
labeled_barplot(data, "Store_Size", perc=True)
Outcomes: Medium stores dominate dataset representation; store size is likely a material driver variable.
Frequency Analysis: Store_Location_City_Type¶
Analyzes Store_Location_City_Type frequency.
#-------------
# FREQUENCY ANALYSIS: STORE_LOCATION_CITY_TYPE
#-------------
labeled_barplot(data, "Store_Location_City_Type", perc=True)
Outcomes: Tier 2 cities account for the largest share of observations; geographic segmentation effects may influence sales patterns.
Frequency Analysis: Store_Type¶
Analyzes Store_Type frequency.
#-------------
# FREQUENCY ANALYSIS: STORE_TYPE
#-------------
labeled_barplot(data, "Store_Type", perc=True)
Outcomes: Store type distribution is uneven; store format should remain a primary categorical feature.
Correlation Matrix¶
Visualizes correlations.
#-------------
# CORRELATION MATRIX
#-------------
# DataFrame.corr is used to compute pairwise correlations among numeric variables.
# seaborn.heatmap is used to visualize correlation strength and direction.
cols_list = data.select_dtypes(include=np.number).columns.tolist()
plt.figure(figsize=(10, 5))
sns.heatmap(
data[cols_list].corr(), annot=True, vmin=-1, vmax=1, fmt=".2f", cmap="Spectral"
)
plt.show()
Outcomes: Product_MRP shows strongest positive correlation with sales (~0.79), followed by Product_Weight (~0.74); price is the dominant linear predictor.
Bivariate Analysis¶
Scatterplot visualization.
#-------------
# BIVARIATE ANALYSIS
#-------------
# seaborn.scatterplot is used to visualize bivariate feature relationships.
# This supports inspection of linearity and variance structure.
plt.figure(figsize=[8, 6])
sns.scatterplot(x=data.Product_Weight, y=data.Product_Store_Sales_Total)
plt.show()
Outcomes: Product_Weight vs. Product_Store_Sales_Total relationship visualized for linearity and variance inspection.
Bivariate Analysis¶
Scatterplot visualization.
#-------------
# BIVARIATE ANALYSIS
#-------------
# seaborn.scatterplot is used to visualize bivariate feature relationships.
# This supports inspection of linearity and variance structure.
plt.figure(figsize=[8, 6])
sns.scatterplot(x=data.Product_Allocated_Area, y=data.Product_Store_Sales_Total)
plt.show()
Outcomes: Product_Allocated_Area vs. Product_Store_Sales_Total relationship visualized; no strong linear pattern is apparent.
Bivariate Analysis¶
Scatterplot visualization.
#-------------
# BIVARIATE ANALYSIS
#-------------
# seaborn.scatterplot is used to visualize bivariate feature relationships.
# This supports inspection of linearity and variance structure.
plt.figure(figsize=[8, 6])
sns.scatterplot(x=data.Product_MRP, y=data.Product_Store_Sales_Total)
plt.show()
Outcomes: Sales increase strongly with MRP, reinforcing pricing as the most influential numeric driver.
Revenue Aggregation¶
Aggregates revenue.
#-------------
# REVENUE AGGREGATION
#-------------
# DataFrame.groupby is used to aggregate sales by categorical dimension.
# GroupBy.sum is used to compute total revenue per segment.
df_revenue1 = data.groupby(["Product_Type"], as_index=False)[
"Product_Store_Sales_Total"
].sum()
plt.figure(figsize=[14, 8])
plt.xticks(rotation=90)
a = sns.barplot(x=df_revenue1.Product_Type, y=df_revenue1.Product_Store_Sales_Total)
a.set_xlabel("Product Types")
a.set_ylabel("Revenue")
plt.show()
Outcomes: Revenue is concentrated in Fruits and Vegetables and Snack Foods; category effects should be monitored during model validation.
Revenue Aggregation¶
Aggregates revenue.
#-------------
# REVENUE AGGREGATION
#-------------
# DataFrame.groupby is used to aggregate sales by categorical dimension.
# GroupBy.sum is used to compute total revenue per segment.
df_revenue2 = data.groupby(["Product_Sugar_Content"], as_index=False)[
"Product_Store_Sales_Total"
].sum()
plt.figure(figsize=[8, 6])
plt.xticks(rotation=90)
b = sns.barplot(
x=df_revenue2.Product_Sugar_Content, y=df_revenue2.Product_Store_Sales_Total
)
b.set_xlabel("Product_Sugar_content")
b.set_ylabel("Revenue")
plt.show()
Outcomes: Revenue aggregated by sugar-content segment.
Revenue Aggregation¶
Aggregates revenue.
#-------------
# REVENUE AGGREGATION
#-------------
# DataFrame.groupby is used to aggregate sales by categorical dimension.
# GroupBy.sum is used to compute total revenue per segment.
df_store_revenue = data.groupby(["Store_Id"], as_index=False)[
"Product_Store_Sales_Total"
].sum()
plt.figure(figsize=[8, 6])
plt.xticks(rotation=90)
r = sns.barplot(
x=df_store_revenue.Store_Id, y=df_store_revenue.Product_Store_Sales_Total
)
r.set_xlabel("Stores")
r.set_ylabel("Revenue")
plt.show()
Outcomes: OUT004 materially exceeds other stores in total revenue; store identifier is a critical predictive feature.
Revenue Aggregation¶
Aggregates revenue.
#-------------
# REVENUE AGGREGATION
#-------------
# DataFrame.groupby is used to aggregate sales by categorical dimension.
# GroupBy.sum is used to compute total revenue per segment.
df_revenue3 = data.groupby(["Store_Size"], as_index=False)[
"Product_Store_Sales_Total"
].sum()
plt.figure(figsize=[8, 6])
plt.xticks(rotation=90)
c = sns.barplot(x=df_revenue3.Store_Size, y=df_revenue3.Product_Store_Sales_Total)
c.set_xlabel("Store_Size")
c.set_ylabel("Revenue")
plt.show()
Outcomes: Revenue aggregated by store size.
Revenue Aggregation¶
Aggregates revenue.
#-------------
# REVENUE AGGREGATION
#-------------
# DataFrame.groupby is used to aggregate sales by categorical dimension.
# GroupBy.sum is used to compute total revenue per segment.
df_revenue4 = data.groupby(["Store_Location_City_Type"], as_index=False)[
"Product_Store_Sales_Total"
].sum()
plt.figure(figsize=[8, 6])
plt.xticks(rotation=90)
d = sns.barplot(
x=df_revenue4.Store_Location_City_Type, y=df_revenue4.Product_Store_Sales_Total
)
d.set_xlabel("Store_Location_City_Type")
d.set_ylabel("Revenue")
plt.show()
Outcomes: Revenue aggregated by city tier.
Revenue Aggregation¶
Aggregates revenue.
#-------------
# REVENUE AGGREGATION
#-------------
# DataFrame.groupby is used to aggregate sales by categorical dimension.
# GroupBy.sum is used to compute total revenue per segment.
df_revenue5 = data.groupby(["Store_Type"], as_index=False)[
"Product_Store_Sales_Total"
].sum()
plt.figure(figsize=[8, 6])
plt.xticks(rotation=90)
e = sns.barplot(x=df_revenue5.Store_Type, y=df_revenue5.Product_Store_Sales_Total)
e.set_xlabel("Store_Type")
e.set_ylabel("Revenue")
plt.show()
Outcomes: Revenue aggregated by store type.
Distribution Comparison: Store vs Sales¶
Boxplot analysis.
#-------------
# DISTRIBUTION COMPARISON: STORE VS SALES
#-------------
plt.figure(figsize=[14, 8])
sns.boxplot(data=data, x="Store_Id", y="Product_Store_Sales_Total", hue = "Store_Id")
plt.xticks(rotation=90)
plt.title("Boxplot - Store_Id Vs Product_Store_Sales_Total")
plt.xlabel("Stores")
plt.ylabel("Product_Store_Sales_Total (of each product)")
plt.show()
Outcomes: OUT004 shows highest median sales and widest spread; variance differs materially by store.
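The boxplot's visual claim about per-store variance can be quantified with a grouped median and IQR. A sketch on toy numbers (illustrative values; in the notebook, run the same pattern on `data` with `data.groupby("Store_Id")["Product_Store_Sales_Total"].describe()`):

```python
import pandas as pd

# Toy per-store sales to illustrate median/IQR comparison across stores.
toy = pd.DataFrame({
    "Store_Id": ["OUT001"] * 4 + ["OUT004"] * 4,
    "Product_Store_Sales_Total": [3000, 3200, 3400, 3600, 2000, 3500, 5000, 6500],
})
stats = toy.groupby("Store_Id")["Product_Store_Sales_Total"].agg(
    median="median",
    iqr=lambda s: s.quantile(0.75) - s.quantile(0.25),  # interquartile range
)
print(stats)
```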
Distribution Comparison¶
Boxplot analysis.
#-------------
# DISTRIBUTION COMPARISON
#-------------
plt.figure(figsize=[14, 8])
sns.boxplot(data=data, x="Store_Size", y="Product_Store_Sales_Total", hue="Store_Size")
plt.xticks(rotation=90)
plt.title("Boxplot - Store_Size Vs Product_Store_Sales_Total")
plt.xlabel("Stores")
plt.ylabel("Product_Store_Sales_Total (of each product)")
plt.show()
Outcomes: Per-product sales distributions differ by store size; medians and spreads are not uniform across size tiers.
Distribution Comparison¶
Boxplot analysis.
#-------------
# DISTRIBUTION COMPARISON
#-------------
plt.figure(figsize=[14, 8])
sns.boxplot(data = data, x = "Product_Type", y = "Product_Weight", hue = "Product_Type")
plt.xticks(rotation=90)
plt.title("Boxplot - Product_Type Vs Product_Weight")
plt.xlabel("Types of Products")
plt.ylabel("Product_Weight")
plt.show()
Outcomes: Product weight distributions vary by product category, as expected for physically distinct goods.
Distribution Comparison¶
Boxplot analysis.
#-------------
# DISTRIBUTION COMPARISON
#-------------
plt.figure(figsize=[14, 8])
sns.boxplot(data = data, x = "Product_Sugar_Content", y = "Product_Weight", hue = "Product_Sugar_Content")
plt.xticks(rotation=90)
plt.title("Boxplot - Product_Sugar_Content Vs Product_Weight")
plt.xlabel("Product_Sugar_Content")
plt.ylabel("Product_Weight")
plt.show()
Outcomes: Product weight distributions compared across sugar-content categories.
Cross-tab heatmap: sugar content by product type¶
Heatmap analysis.
#-------------
# CROSS-TAB HEATMAP: SUGAR CONTENT BY PRODUCT TYPE
#-------------
# pandas.crosstab is used to compute categorical interaction matrices.
# seaborn.heatmap is used to visualize interaction intensity patterns.
plt.figure(figsize=(14, 8))
sns.heatmap(
pd.crosstab(data["Product_Sugar_Content"], data["Product_Type"]),
annot=True,
fmt="g",
cmap="viridis",
)
plt.ylabel("Product_Sugar_Content")
plt.xlabel("Product_Type")
plt.show()
Outcomes: Heatmap generated.
Cross-tab heatmap: store by product type¶
Heatmap analysis.
#-------------
# CROSS-TAB HEATMAP: STORE BY PRODUCT TYPE
#-------------
# pandas.crosstab is used to compute categorical interaction matrices.
# seaborn.heatmap is used to visualize interaction intensity patterns.
plt.figure(figsize=(14, 8))
sns.heatmap(
pd.crosstab(data["Store_Id"], data["Product_Type"]),
annot=True,
fmt="g",
cmap="viridis",
)
plt.ylabel("Stores")
plt.xlabel("Product_Type")
plt.show()
Outcomes: Heatmap generated.
Cross-tab heatmap: sugar content by product type¶
This heatmap visualizes the distribution of Product_Sugar_Content across different Product_Type categories. Insights from this heatmap would include:
- Identifying which product types predominantly fall into 'Low Sugar', 'No Sugar', or 'Regular' categories. For example, some product types might be exclusively 'Low Sugar' or 'No Sugar' due to their nature.
- Understanding the variety of sugar content options available within a single product type, or conversely, product types that offer very limited sugar content variations.
Cross-tab heatmap: store by product type¶
This heatmap illustrates the count of products of each Product_Type sold in each Store_Id. Key insights from this visualization would be:
- Identifying which stores carry a wider range of product types or higher volumes of certain product types. For instance, some stores might specialize in particular categories while others stock a more general inventory.
- Observing if certain product types are consistently popular or have higher stock levels across all stores, or if their distribution is uneven, indicating regional preferences or store-specific strategies.
Distribution Comparison¶
Boxplot analysis.
#-------------
# DISTRIBUTION COMPARISON
#-------------
plt.figure(figsize=[14, 8])
sns.boxplot(data = data, x = "Product_Type", y = "Product_MRP", hue = "Product_Type")
plt.xticks(rotation=90)
plt.title("Boxplot - Product_Type Vs Product_MRP")
plt.xlabel("Product_Type")
plt.ylabel("Product_MRP (of each product)")
plt.show()
Outcomes: Price (MRP) distributions vary across product categories, consistent with category-level pricing effects.
Distribution Comparison¶
Boxplot analysis.
#-------------
# DISTRIBUTION COMPARISON
#-------------
plt.figure(figsize=[14, 8])
sns.boxplot(data = data, x = "Store_Id", y = "Product_MRP", hue = "Store_Id")
plt.xticks(rotation=90)
plt.title("Boxplot - Store_Id Vs Product_MRP")
plt.xlabel("Stores")
plt.ylabel("Product_MRP (of each product)")
plt.show()
Outcomes: MRP distributions differ by store, indicating assortment pricing varies across locations.
Store-level performance analysis¶
Store OUT001: Statistical Profile¶
Descriptive stats for OUT001.
#-------------
# STORE OUT001: STATISTICAL PROFILE
#-------------
# DataFrame.query is used to subset the dataset for a specific store.
# This enables store-level performance profiling.
data.loc[data["Store_Id"] == "OUT001"].describe(include="all").T
| | count | unique | top | freq | mean | std | min | 25% | 50% | 75% | max |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Product_Id | 1586 | 1586 | NC7187 | 1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Product_Weight | 1586.0 | NaN | NaN | NaN | 13.458865 | 2.064975 | 6.16 | 12.0525 | 13.96 | 14.95 | 17.97 |
| Product_Sugar_Content | 1586 | 4 | Low Sugar | 845 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Product_Allocated_Area | 1586.0 | NaN | NaN | NaN | 0.068768 | 0.047131 | 0.004 | 0.033 | 0.0565 | 0.094 | 0.295 |
| Product_Type | 1586 | 16 | Snack Foods | 202 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Product_MRP | 1586.0 | NaN | NaN | NaN | 160.514054 | 30.359059 | 71.35 | 141.72 | 168.32 | 182.9375 | 226.59 |
| Store_Id | 1586 | 1 | OUT001 | 1586 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Store_Establishment_Year | 1586.0 | NaN | NaN | NaN | 1987.0 | 0.0 | 1987.0 | 1987.0 | 1987.0 | 1987.0 | 1987.0 |
| Store_Size | 1586 | 1 | High | 1586 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Store_Location_City_Type | 1586 | 1 | Tier 2 | 1586 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Store_Type | 1586 | 1 | Supermarket Type1 | 1586 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Product_Store_Sales_Total | 1586.0 | NaN | NaN | NaN | 3923.778802 | 904.62901 | 2300.56 | 3285.51 | 4139.645 | 4639.4 | 4997.63 |
Outcomes: OUT001 is a High-size Tier 2 Supermarket Type1 (est. 1987) carrying 1,586 products with mean per-product sales of ~3,924; store attributes are constant within the subset and usable as features.
Store OUT001: Total Revenue¶
Calculates total revenue.
#-------------
# STORE OUT001: TOTAL REVENUE
#-------------
# Series.sum is used to compute total store revenue.
# This supports store performance ranking.
data.loc[data["Store_Id"] == "OUT001", "Product_Store_Sales_Total"].sum()
np.float64(6223113.18)
Outcomes: OUT001 total revenue is 6,223,113.18; performance is mid-tier relative to peer stores.
Store OUT001: Revenue by Product Type¶
Aggregates and plots revenue by product type.
#-------------
# STORE OUT001: REVENUE BY PRODUCT TYPE
#-------------
# DataFrame.groupby is used to aggregate sales by categorical dimension.
# GroupBy.sum is used to compute total revenue per segment.
df_OUT001 = (
data.loc[data["Store_Id"] == "OUT001"]
.groupby(["Product_Type"], as_index=False)["Product_Store_Sales_Total"]
.sum()
)
plt.figure(figsize=[14, 8])
plt.xticks(rotation=90)
plt.xlabel("Product_Type")
plt.ylabel("Product_Store_Sales_Total")
plt.title("OUT001")
sns.barplot(x=df_OUT001.Product_Type, y=df_OUT001.Product_Store_Sales_Total)
plt.show()
Outcomes: Snack Foods and Fruits and Vegetables are OUT001's top revenue categories; the category mix is broad rather than concentrated.
Store OUT002: Statistical Profile¶
Descriptive stats for OUT002.
#-------------
# STORE OUT002: STATISTICAL PROFILE
#-------------
# Boolean indexing with DataFrame.loc is used to subset the dataset for a specific store.
# This enables store-level performance profiling.
data.loc[data["Store_Id"] == "OUT002"].describe(include="all").T
| count | unique | top | freq | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Product_Id | 1152 | 1152 | NC2769 | 1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Product_Weight | 1152.0 | NaN | NaN | NaN | 9.911241 | 1.799846 | 4.0 | 8.7675 | 9.795 | 10.89 | 19.82 |
| Product_Sugar_Content | 1152 | 4 | Low Sugar | 658 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Product_Allocated_Area | 1152.0 | NaN | NaN | NaN | 0.067747 | 0.047567 | 0.006 | 0.031 | 0.0545 | 0.09525 | 0.292 |
| Product_Type | 1152 | 16 | Fruits and Vegetables | 168 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Product_MRP | 1152.0 | NaN | NaN | NaN | 107.080634 | 24.912333 | 31.0 | 92.8275 | 104.675 | 117.8175 | 224.93 |
| Store_Id | 1152 | 1 | OUT002 | 1152 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Store_Establishment_Year | 1152.0 | NaN | NaN | NaN | 1998.0 | 0.0 | 1998.0 | 1998.0 | 1998.0 | 1998.0 | 1998.0 |
| Store_Size | 1152 | 1 | Small | 1152 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Store_Location_City_Type | 1152 | 1 | Tier 3 | 1152 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Store_Type | 1152 | 1 | Food Mart | 1152 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Product_Store_Sales_Total | 1152.0 | NaN | NaN | NaN | 1762.942465 | 462.862431 | 33.0 | 1495.4725 | 1889.495 | 2133.6225 | 2299.63 |
Outcomes: OUT002 is a Small Tier 3 Food Mart (est. 1998) carrying 1,152 products with mean per-product sales of ~1,763, the lowest of the four stores.
Store OUT002: Total Revenue¶
Calculates total revenue.
#-------------
# STORE OUT002: TOTAL REVENUE
#-------------
# Series.sum is used to compute total store revenue.
# This supports store performance ranking.
data.loc[data["Store_Id"] == "OUT002", "Product_Store_Sales_Total"].sum()
np.float64(2030909.72)
Outcomes: OUT002 total revenue is 2,030,909.72, lowest among the four stores.
Store OUT002: Revenue by Product Type¶
Aggregates and plots revenue by product type.
#-------------
# STORE OUT002: REVENUE BY PRODUCT TYPE
#-------------
# DataFrame.groupby is used to aggregate sales by categorical dimension.
# GroupBy.sum is used to compute total revenue per segment.
df_OUT002 = (
data.loc[data["Store_Id"] == "OUT002"]
.groupby(["Product_Type"], as_index=False)["Product_Store_Sales_Total"]
.sum()
)
plt.figure(figsize=[14, 8])
plt.xticks(rotation=90)
plt.xlabel("Product_Type")
plt.ylabel("Product_Store_Sales_Total")
plt.title("OUT002")
sns.barplot(x=df_OUT002.Product_Type, y=df_OUT002.Product_Store_Sales_Total)
plt.show()
Outcomes: Fruits and Vegetables and Snack Foods lead OUT002 revenue; absolute category revenues are the lowest of the four stores.
Store OUT003: Statistical Profile¶
Descriptive stats for OUT003.
#-------------
# STORE OUT003: STATISTICAL PROFILE
#-------------
# Boolean indexing with DataFrame.loc is used to subset the dataset for a specific store.
# This enables store-level performance profiling.
data.loc[data["Store_Id"] == "OUT003"].describe(include="all").T
| count | unique | top | freq | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Product_Id | 1349 | 1349 | NC522 | 1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Product_Weight | 1349.0 | NaN | NaN | NaN | 15.103692 | 1.893531 | 7.35 | 14.02 | 15.18 | 16.35 | 22.0 |
| Product_Sugar_Content | 1349 | 4 | Low Sugar | 750 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Product_Allocated_Area | 1349.0 | NaN | NaN | NaN | 0.068637 | 0.048708 | 0.004 | 0.031 | 0.057 | 0.094 | 0.298 |
| Product_Type | 1349 | 16 | Snack Foods | 186 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Product_MRP | 1349.0 | NaN | NaN | NaN | 181.358725 | 24.796429 | 85.88 | 166.92 | 179.67 | 198.07 | 266.0 |
| Store_Id | 1349 | 1 | OUT003 | 1349 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Store_Establishment_Year | 1349.0 | NaN | NaN | NaN | 1999.0 | 0.0 | 1999.0 | 1999.0 | 1999.0 | 1999.0 | 1999.0 |
| Store_Size | 1349 | 1 | Medium | 1349 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Store_Location_City_Type | 1349 | 1 | Tier 1 | 1349 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Store_Type | 1349 | 1 | Departmental Store | 1349 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Product_Store_Sales_Total | 1349.0 | NaN | NaN | NaN | 4946.966323 | 677.539953 | 3069.24 | 4355.39 | 4958.29 | 5366.59 | 8000.0 |
Outcomes: OUT003 is a Medium Tier 1 Departmental Store (est. 1999) carrying 1,349 products with the highest mean per-product sales (~4,947) and the highest mean MRP.
Store OUT003: Total Revenue¶
Calculates total revenue.
#-------------
# STORE OUT003: TOTAL REVENUE
#-------------
# Series.sum is used to compute total store revenue.
# This supports store performance ranking.
data.loc[data["Store_Id"] == "OUT003", "Product_Store_Sales_Total"].sum()
np.float64(6673457.57)
Outcomes: OUT003 total revenue is 6,673,457.57, second-highest overall.
Store OUT003: Revenue by Product Type¶
Aggregates and plots revenue by product type.
#-------------
# STORE OUT003: REVENUE BY PRODUCT TYPE
#-------------
# DataFrame.groupby is used to aggregate sales by categorical dimension.
# GroupBy.sum is used to compute total revenue per segment.
df_OUT003 = (
data.loc[data["Store_Id"] == "OUT003"]
.groupby(["Product_Type"], as_index=False)["Product_Store_Sales_Total"]
.sum()
)
plt.figure(figsize=[14, 8])
plt.xticks(rotation=90)
plt.xlabel("Product_Type")
plt.ylabel("Product_Store_Sales_Total")
plt.title("OUT003")
sns.barplot(x=df_OUT003.Product_Type, y=df_OUT003.Product_Store_Sales_Total)
plt.show()
Outcomes: Snack Foods and Fruits and Vegetables lead OUT003 revenue across categories.
Store OUT004: Statistical Profile¶
Descriptive stats for OUT004.
#-------------
# STORE OUT004: STATISTICAL PROFILE
#-------------
# Boolean indexing with DataFrame.loc is used to subset the dataset for a specific store.
# This enables store-level performance profiling.
data.loc[data["Store_Id"] == "OUT004"].describe(include="all").T
| count | unique | top | freq | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Product_Id | 4676 | 4676 | NC584 | 1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Product_Weight | 4676.0 | NaN | NaN | NaN | 12.349613 | 1.428199 | 7.34 | 11.37 | 12.37 | 13.3025 | 17.79 |
| Product_Sugar_Content | 4676 | 4 | Low Sugar | 2632 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Product_Allocated_Area | 4676.0 | NaN | NaN | NaN | 0.069092 | 0.048584 | 0.004 | 0.031 | 0.056 | 0.097 | 0.297 |
| Product_Type | 4676 | 16 | Fruits and Vegetables | 700 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Product_MRP | 4676.0 | NaN | NaN | NaN | 142.399709 | 17.513973 | 83.04 | 130.54 | 142.82 | 154.1925 | 197.66 |
| Store_Id | 4676 | 1 | OUT004 | 4676 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Store_Establishment_Year | 4676.0 | NaN | NaN | NaN | 2009.0 | 0.0 | 2009.0 | 2009.0 | 2009.0 | 2009.0 | 2009.0 |
| Store_Size | 4676 | 1 | Medium | 4676 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Store_Location_City_Type | 4676 | 1 | Tier 2 | 4676 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Store_Type | 4676 | 1 | Supermarket Type2 | 4676 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Product_Store_Sales_Total | 4676.0 | NaN | NaN | NaN | 3299.312111 | 468.271692 | 1561.06 | 2942.085 | 3304.18 | 3646.9075 | 5462.86 |
Outcomes: OUT004 is a Medium Tier 2 Supermarket Type2 (est. 2009) with the largest assortment (4,676 products) and mean per-product sales of ~3,299; assortment scale rather than per-product intensity drives its revenue lead.
Store OUT004: Total Revenue¶
Calculates total revenue.
#-------------
# STORE OUT004: TOTAL REVENUE
#-------------
# Series.sum is used to compute total store revenue.
# This supports store performance ranking.
data.loc[data["Store_Id"] == "OUT004", "Product_Store_Sales_Total"].sum()
np.float64(15427583.43)
Outcomes: OUT004 total revenue is 15,427,583.43, highest among all stores and materially above peers.
Store OUT004: Revenue by Product Type¶
Aggregates and plots revenue by product type.
#-------------
# STORE OUT004: REVENUE BY PRODUCT TYPE
#-------------
# DataFrame.groupby is used to aggregate sales by categorical dimension.
# GroupBy.sum is used to compute total revenue per segment.
df_OUT004 = (
data.loc[data["Store_Id"] == "OUT004"]
.groupby(["Product_Type"], as_index=False)["Product_Store_Sales_Total"]
.sum()
)
plt.figure(figsize=[14, 8])
plt.xticks(rotation=90)
plt.xlabel("Product_Type")
plt.ylabel("Product_Store_Sales_Total")
plt.title("OUT004")
sns.barplot(x=df_OUT004.Product_Type, y=df_OUT004.Product_Store_Sales_Total)
plt.show()
Outcomes: Fruits and Vegetables and Snack Foods lead OUT004 revenue, each materially above any single category at peer stores.
Revenue by product type by store¶
Global store comparison.
#-------------
# REVENUE BY PRODUCT TYPE BY STORE
#-------------
# DataFrame.groupby is used to aggregate sales by categorical dimension.
# GroupBy.sum is used to compute total revenue per segment.
df1 = data.groupby(["Product_Type", "Store_Id"], as_index=False)[
"Product_Store_Sales_Total"
].sum()
df1
| Product_Type | Store_Id | Product_Store_Sales_Total | |
|---|---|---|---|
| 0 | Baking Goods | OUT001 | 525131.04 |
| 1 | Baking Goods | OUT002 | 169860.50 |
| 2 | Baking Goods | OUT003 | 491908.20 |
| 3 | Baking Goods | OUT004 | 1266086.26 |
| 4 | Breads | OUT001 | 121274.09 |
| 5 | Breads | OUT002 | 43419.47 |
| 6 | Breads | OUT003 | 175391.93 |
| 7 | Breads | OUT004 | 374856.75 |
| 8 | Breakfast | OUT001 | 38161.10 |
| 9 | Breakfast | OUT002 | 23396.10 |
| 10 | Breakfast | OUT003 | 95634.08 |
| 11 | Breakfast | OUT004 | 204939.13 |
| 12 | Canned | OUT001 | 449016.38 |
| 13 | Canned | OUT002 | 151467.66 |
| 14 | Canned | OUT003 | 452445.17 |
| 15 | Canned | OUT004 | 1247153.50 |
| 16 | Dairy | OUT001 | 598767.62 |
| 17 | Dairy | OUT002 | 178888.18 |
| 18 | Dairy | OUT003 | 715814.94 |
| 19 | Dairy | OUT004 | 1318447.30 |
| 20 | Frozen Foods | OUT001 | 558556.81 |
| 21 | Frozen Foods | OUT002 | 180295.95 |
| 22 | Frozen Foods | OUT003 | 597608.42 |
| 23 | Frozen Foods | OUT004 | 1473519.65 |
| 24 | Fruits and Vegetables | OUT001 | 792992.59 |
| 25 | Fruits and Vegetables | OUT002 | 298503.56 |
| 26 | Fruits and Vegetables | OUT003 | 897437.46 |
| 27 | Fruits and Vegetables | OUT004 | 2311899.66 |
| 28 | Hard Drinks | OUT001 | 152920.74 |
| 29 | Hard Drinks | OUT002 | 54281.85 |
| 30 | Hard Drinks | OUT003 | 110760.30 |
| 31 | Hard Drinks | OUT004 | 307851.73 |
| 32 | Health and Hygiene | OUT001 | 435005.31 |
| 33 | Health and Hygiene | OUT002 | 164660.81 |
| 34 | Health and Hygiene | OUT003 | 439139.18 |
| 35 | Health and Hygiene | OUT004 | 1124901.91 |
| 36 | Household | OUT001 | 531371.38 |
| 37 | Household | OUT002 | 184665.65 |
| 38 | Household | OUT003 | 523981.64 |
| 39 | Household | OUT004 | 1324721.50 |
| 40 | Meat | OUT001 | 505867.28 |
| 41 | Meat | OUT002 | 151800.01 |
| 42 | Meat | OUT003 | 520939.68 |
| 43 | Meat | OUT004 | 950604.97 |
| 44 | Others | OUT001 | 123977.09 |
| 45 | Others | OUT002 | 32835.73 |
| 46 | Others | OUT003 | 159963.75 |
| 47 | Others | OUT004 | 224719.73 |
| 48 | Seafood | OUT001 | 52936.84 |
| 49 | Seafood | OUT002 | 17663.35 |
| 50 | Seafood | OUT003 | 65337.48 |
| 51 | Seafood | OUT004 | 136466.37 |
| 52 | Snack Foods | OUT001 | 806142.24 |
| 53 | Snack Foods | OUT002 | 255317.57 |
| 54 | Snack Foods | OUT003 | 918510.44 |
| 55 | Snack Foods | OUT004 | 2009026.70 |
| 56 | Soft Drinks | OUT001 | 410548.69 |
| 57 | Soft Drinks | OUT002 | 103808.35 |
| 58 | Soft Drinks | OUT003 | 365046.30 |
| 59 | Soft Drinks | OUT004 | 917641.38 |
| 60 | Starchy Foods | OUT001 | 120443.98 |
| 61 | Starchy Foods | OUT002 | 20044.98 |
| 62 | Starchy Foods | OUT003 | 143538.60 |
| 63 | Starchy Foods | OUT004 | 234746.89 |
Outcomes: Store OUT004 leads revenue across multiple product categories, with category contributions materially higher than other stores.
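The long-format aggregation above can also be viewed as a store-per-column matrix for side-by-side comparison; a minimal sketch using `pivot_table` on a toy frame that mirrors the notebook's schema (values taken from the table above):

```python
import pandas as pd

# Toy frame mirroring the notebook's schema (two categories, two stores)
demo = pd.DataFrame({
    "Product_Type": ["Breads", "Breads", "Dairy", "Dairy"],
    "Store_Id": ["OUT001", "OUT002", "OUT001", "OUT002"],
    "Product_Store_Sales_Total": [121274.09, 43419.47, 598767.62, 178888.18],
})

# Reshape: one row per product type, one column per store
wide = demo.pivot_table(
    index="Product_Type",
    columns="Store_Id",
    values="Product_Store_Sales_Total",
    aggfunc="sum",
)
print(wide)
```

The wide layout makes cross-store gaps within a category immediately visible.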
Revenue by sugar content by store¶
Aggregates revenue by sugar content and store.
#-------------
# REVENUE BY SUGAR CONTENT BY STORE
#-------------
# DataFrame.groupby is used to aggregate sales by categorical dimension.
# GroupBy.sum is used to compute total revenue per segment.
df2 = data.groupby(["Product_Sugar_Content", "Store_Id"], as_index=False)[
"Product_Store_Sales_Total"
].sum()
df2
| Product_Sugar_Content | Store_Id | Product_Store_Sales_Total | |
|---|---|---|---|
| 0 | Low Sugar | OUT001 | 3300834.93 |
| 1 | Low Sugar | OUT002 | 1156758.85 |
| 2 | Low Sugar | OUT003 | 3706903.24 |
| 3 | Low Sugar | OUT004 | 8658908.78 |
| 4 | No Sugar | OUT001 | 1090353.78 |
| 5 | No Sugar | OUT002 | 382162.19 |
| 6 | No Sugar | OUT003 | 1123084.57 |
| 7 | No Sugar | OUT004 | 2674343.14 |
| 8 | Regular | OUT001 | 1749444.51 |
| 9 | Regular | OUT002 | 472112.50 |
| 10 | Regular | OUT003 | 1743566.35 |
| 11 | Regular | OUT004 | 3902547.93 |
| 12 | reg | OUT001 | 82479.96 |
| 13 | reg | OUT002 | 19876.18 |
| 14 | reg | OUT003 | 99903.41 |
| 15 | reg | OUT004 | 191783.58 |
Outcomes: OUT004 materially exceeds other stores across every sugar-content segment; the 'reg' label duplicates 'Regular' and requires standardization, and store identity is a critical predictive feature.
Feature Engineering and Column Dropping¶
This section performs critical data transformations to prepare the dataset for modeling:
- Standardization: `Product_Sugar_Content` values are normalized (e.g., 'reg' -> 'Regular').
- Feature Extraction:
  - `Product_Id_char`: extracted from the first two characters of `Product_Id`.
  - `Store_Age_Years`: calculated as `2025 - Store_Establishment_Year`.
  - `Product_Type_Category`: derived from `Product_Type` (Perishables vs Non Perishables).
- Column Dropping: the following columns are dropped to prevent overfitting:
  - `Product_Id`: high-cardinality identifier.
  - `Store_Id`: high-cardinality identifier.
  - `Store_Establishment_Year`: replaced by `Store_Age_Years`.
#-------------
# FEATURE ENGINEERING AND COLUMN DROPPING
#-------------
# Series.replace, Series.str, and Series.apply are used to standardize values and derive features.
# DataFrame.head at the end confirms the transformed schema.
# Standardize Product_Sugar_Content
data['Product_Sugar_Content'] = data['Product_Sugar_Content'].replace({'reg': 'Regular'})
# Extract Product_Id_char
data['Product_Id_char'] = data['Product_Id'].str[:2]
# Calculate Store_Age_Years
data['Store_Age_Years'] = 2025 - data['Store_Establishment_Year']
# Create Product_Type_Category
perishables = ["Dairy", "Meat", "Fruits and Vegetables", "Breakfast", "Breads", "Seafood", "Starchy Foods"]
data['Product_Type_Category'] = data['Product_Type'].apply(lambda x: 'Perishables' if x in perishables else 'Non Perishables')
# Drop ID columns and redundant columns
# Note: We keep Product_MRP, Weight etc. We drop ID and Establishment Year (replaced by Age).
cols_to_drop = ['Product_Id', 'Store_Id', 'Store_Establishment_Year']
data = data.drop(columns=cols_to_drop)
# Verify DataFrame
data.head()
| Product_Weight | Product_Sugar_Content | Product_Allocated_Area | Product_Type | Product_MRP | Store_Size | Store_Location_City_Type | Store_Type | Product_Store_Sales_Total | Product_Id_char | Store_Age_Years | Product_Type_Category | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 12.66 | Low Sugar | 0.027 | Frozen Foods | 117.08 | Medium | Tier 2 | Supermarket Type2 | 2842.40 | FD | 16 | Non Perishables |
| 1 | 16.54 | Low Sugar | 0.144 | Dairy | 171.43 | Medium | Tier 1 | Departmental Store | 4830.02 | FD | 26 | Perishables |
| 2 | 14.28 | Regular | 0.031 | Canned | 162.08 | High | Tier 2 | Supermarket Type1 | 4130.16 | FD | 38 | Non Perishables |
| 3 | 12.10 | Low Sugar | 0.112 | Baking Goods | 186.31 | High | Tier 2 | Supermarket Type1 | 4132.18 | FD | 38 | Non Perishables |
| 4 | 9.57 | No Sugar | 0.010 | Health and Hygiene | 123.67 | Small | Tier 3 | Food Mart | 2279.36 | NC | 27 | Non Perishables |
Outcomes: Dataset includes product attributes, store attributes, and total sales target required for supervised regression.
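The transformations above can be sanity-checked on a toy record; a minimal sketch applying the same rules and column names:

```python
import pandas as pd

# Two toy rows exercising each transformation rule
toy = pd.DataFrame({
    "Product_Id": ["FD1234", "NC7187"],
    "Product_Sugar_Content": ["reg", "Low Sugar"],
    "Product_Type": ["Dairy", "Household"],
    "Store_Establishment_Year": [2009, 1987],
})

# Standardize the 'reg' variant, extract the ID prefix, derive age and category
toy["Product_Sugar_Content"] = toy["Product_Sugar_Content"].replace({"reg": "Regular"})
toy["Product_Id_char"] = toy["Product_Id"].str[:2]
toy["Store_Age_Years"] = 2025 - toy["Store_Establishment_Year"]
perishables = {"Dairy", "Meat", "Fruits and Vegetables", "Breakfast",
               "Breads", "Seafood", "Starchy Foods"}
toy["Product_Type_Category"] = toy["Product_Type"].apply(
    lambda x: "Perishables" if x in perishables else "Non Perishables"
)
print(toy[["Product_Sugar_Content", "Product_Id_char",
           "Store_Age_Years", "Product_Type_Category"]])
```

Such spot checks catch silent mapping errors before they propagate into the model.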
Train-Test Split¶
Splits features and target into train and test sets.
#-------------
# TRAIN-TEST SPLIT
#-------------
# train_test_split is used to partition the data 70/30.
# A fixed random_state ensures a reproducible split.
X = data.drop('Product_Store_Sales_Total', axis=1)
y = data['Product_Store_Sales_Total']
# 70/30 Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, shuffle=True)
print(f"Train Shape: {X_train.shape}, Test Shape: {X_test.shape}")
Train Shape: (6134, 11), Test Shape: (2629, 11)
Outcomes: 8,763 records with 11 predictor features; the 70/30 split yields 6,134 training and 2,629 test rows, sufficient for ensemble-based regression without dimensionality reduction.
Preprocessing Pipeline Definition¶
Defines imputation and encoding transformers.
#-------------
# PREPROCESSING PIPELINE DEFINITION
#-------------
# ColumnTransformer is used to apply preprocessing to numeric and categorical features.
# OneHotEncoder is used to convert categorical variables into model-ready indicators.
# Pipeline is used to chain preprocessing and modeling to prevent data leakage.
from sklearn.impute import SimpleImputer
# Define Preprocessing Pipeline
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='median', add_indicator=True))
])
categorical_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='most_frequent')),
('encoder', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer(
transformers=[
('num', numeric_transformer, numeric_features),
('cat', categorical_transformer, categorical_features)
]
)
print("Preprocessor Defined.")
Preprocessor Defined.
Outcomes: Preprocessor defined; numeric features are median-imputed with missingness indicators, and categorical features are mode-imputed and one-hot encoded with unknown categories ignored at inference.
Model Definition¶
Defines models.
#-------------
# MODEL DEFINITION
#-------------
# DecisionTreeRegressor is used as a baseline non-linear regression model.
# This model captures hierarchical feature splits.
# Define Models and Parameter Grids
models = {
"Decision Tree": DecisionTreeRegressor(random_state=1),
"Bagging": BaggingRegressor(random_state=1),
"Random Forest": RandomForestRegressor(random_state=1),
"AdaBoost": AdaBoostRegressor(random_state=1),
"Gradient Boosting": GradientBoostingRegressor(random_state=1),
"XGBoost": XGBRegressor(random_state=1)
}
param_grids = {
"Decision Tree": {
"regressor__max_depth": [3, 5, 7, None],
"regressor__min_samples_leaf": [1, 5, 10]
},
"Bagging": {
"regressor__n_estimators": [10, 50, 100],
"regressor__max_samples": [0.5, 0.8, 1.0]
},
"Random Forest": {
"regressor__n_estimators": [50, 100],
"regressor__max_depth": [5, 10, None],
"regressor__max_features": ['sqrt', 'log2', None]
},
"AdaBoost": {
"regressor__n_estimators": [50, 100],
"regressor__learning_rate": [0.01, 0.1, 1.0]
},
"Gradient Boosting": {
"regressor__n_estimators": [50, 100],
"regressor__learning_rate": [0.01, 0.1],
"regressor__max_depth": [3, 5]
},
"XGBoost": {
"regressor__n_estimators": [50, 100],
"regressor__learning_rate": [0.01, 0.1],
"regressor__max_depth": [3, 5]
}
}
Outcomes: Models defined.
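The `regressor__` prefix in the grids above is scikit-learn's convention for routing a parameter to the named pipeline step; a minimal sketch of the mechanism on synthetic data:

```python
from sklearn.datasets import make_regression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor

X_demo, y_demo = make_regression(n_samples=200, n_features=4, noise=10.0, random_state=1)

# "<step name>__<parameter>" targets that step inside the pipeline
demo_pipe = Pipeline([("scaler", StandardScaler()),
                      ("regressor", DecisionTreeRegressor(random_state=1))])
demo_grid = {"regressor__max_depth": [3, 5]}
demo_search = GridSearchCV(demo_pipe, demo_grid, cv=3, scoring="r2")
demo_search.fit(X_demo, y_demo)
print(demo_search.best_params_)
```

Because the whole pipeline is searched, preprocessing is refit inside each cross-validation fold, avoiding leakage.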
Model Evaluation Function¶
Defines the evaluation helper.
#-------------
# MODEL EVALUATION FUNCTION
#-------------
# evaluate_model fits a base pipeline, tunes it with GridSearchCV,
# and returns train/test metrics plus the fitted estimators.
def evaluate_model(model_name, model, param_grid, X_train, y_train, X_test, y_test):
print(f"Processing {model_name}...")
# Create Pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', model)])
# 1. Evaluate Base Model
pipeline.fit(X_train, y_train)
base_fitted_estimator = pipeline
base_pred_train = pipeline.predict(X_train)
base_pred_test = pipeline.predict(X_test)
def get_metrics(y_true, y_pred, subset):
return {
f"{subset}_RMSE": np.sqrt(mean_squared_error(y_true, y_pred)),
f"{subset}_MAE": mean_absolute_error(y_true, y_pred),
f"{subset}_R2": r2_score(y_true, y_pred),
f"{subset}_Adj_R2": 1 - (1 - r2_score(y_true, y_pred)) * (len(y_true) - 1) / (len(y_true) - X_train.shape[1] - 1),
f"{subset}_MAPE": mean_absolute_percentage_error(y_true, y_pred)
}
base_metrics_train = get_metrics(y_train, base_pred_train, "Train")
base_metrics_test = get_metrics(y_test, base_pred_test, "Test")
# 2. Hyperparameter Tuning
search = GridSearchCV(pipeline, param_grid, cv=3, n_jobs=-1, scoring='r2')
search.fit(X_train, y_train)
best_model = search.best_estimator_
tuned_fitted_estimator = best_model
tuned_pred_train = best_model.predict(X_train)
tuned_pred_test = best_model.predict(X_test)
tuned_metrics_train = get_metrics(y_train, tuned_pred_train, "Train")
tuned_metrics_test = get_metrics(y_test, tuned_pred_test, "Test")
print(f" Best Params: {search.best_params_}")
return {
"Base": {**base_metrics_train, **base_metrics_test, "Fitted Estimator": base_fitted_estimator},
"Tuned": {**tuned_metrics_train, **tuned_metrics_test, "Best Params": search.best_params_, "Fitted Estimator": tuned_fitted_estimator}
}
Outcomes: Evaluation helper defined; it reports RMSE, MAE, R-squared, adjusted R-squared, and MAPE for both base and tuned variants of each model.
Model Training Loop¶
Trains models.
#-------------
# MODEL TRAINING LOOP
#-------------
# mean_squared_error is used to compute RMSE for error magnitude evaluation.
# r2_score is used to measure variance explained by the model.
# MAPE is used to measure relative prediction error in percentage terms.
# Execution Loop
results = {}
for name, model in models.items():
results[name] = evaluate_model(name, model, param_grids[name], X_train, y_train, X_test, y_test)
Processing Decision Tree...
Best Params: {'regressor__max_depth': None, 'regressor__min_samples_leaf': 10}
Processing Bagging...
Best Params: {'regressor__max_samples': 0.5, 'regressor__n_estimators': 100}
Processing Random Forest...
Best Params: {'regressor__max_depth': 10, 'regressor__max_features': None, 'regressor__n_estimators': 100}
Processing AdaBoost...
Best Params: {'regressor__learning_rate': 0.01, 'regressor__n_estimators': 50}
Processing Gradient Boosting...
Best Params: {'regressor__learning_rate': 0.1, 'regressor__max_depth': 5, 'regressor__n_estimators': 50}
Processing XGBoost...
Best Params: {'regressor__learning_rate': 0.1, 'regressor__max_depth': 5, 'regressor__n_estimators': 50}
Outcomes: Models trained.
Model comparison summary¶
Performance Aggregation¶
Aggregates results.
- RMSE (Root Mean Squared Error) measures the average magnitude of the errors. It indicates the square root of the average of the squared differences between predicted and actual values. Lower values indicate better model fit.
- MAE (Mean Absolute Error) measures the average magnitude of the errors, without considering their direction. It indicates the average of the absolute differences between predicted and actual values. Lower values indicate better model fit.
- R-squared (Coefficient of Determination) measures the proportion of the variance in the dependent variable that is predictable from the independent variables. It indicates how well the model fits the observed data. Higher values (closer to 1) indicate a better fit.
- Adj. R-squared (Adjusted R-squared) measures the proportion of variance explained by the model, adjusted for the number of predictors in the model. It indicates the goodness-of-fit while accounting for model complexity. Higher values indicate a better fit, especially when comparing models with different numbers of predictors.
- MAPE (Mean Absolute Percentage Error) measures the accuracy of a forecast in terms of percentage. It indicates the average of the absolute percentage errors between predicted and actual values. Lower values indicate better forecast accuracy.
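The five metrics above can be reproduced directly from scikit-learn plus the adjusted R-squared formula used in the evaluation helper; a minimal sketch on toy values (p, the predictor count, is illustrative):

```python
import numpy as np
from sklearn.metrics import (mean_absolute_error, mean_absolute_percentage_error,
                             mean_squared_error, r2_score)

y_true = np.array([100.0, 200.0, 300.0, 400.0])
y_pred = np.array([110.0, 190.0, 310.0, 390.0])

rmse = np.sqrt(mean_squared_error(y_true, y_pred))     # typical error magnitude
mae = mean_absolute_error(y_true, y_pred)              # mean absolute error
r2 = r2_score(y_true, y_pred)                          # variance explained
n, p = len(y_true), 2                                  # p = number of predictors (illustrative)
adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)          # complexity-penalized R-squared
mape = mean_absolute_percentage_error(y_true, y_pred)  # mean relative error

print(round(rmse, 2), round(mae, 2), round(r2, 4), round(mape, 4))
```

With every error equal to 10, RMSE and MAE coincide here; they diverge when a few large errors dominate, which is why both are tracked.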
#-------------
# PERFORMANCE AGGREGATION
#-------------
train_data = {}
test_data = {}
for model_name, res in results.items():
# Base
train_data[f"{model_name}"] = {
"RMSE": res['Base']['Train_RMSE'],
"MAE": res['Base']['Train_MAE'],
"R-squared": res['Base']['Train_R2'],
"Adj. R-squared": res['Base']['Train_Adj_R2'],
"MAPE": res['Base']['Train_MAPE']
}
test_data[f"{model_name}"] = {
"RMSE": res['Base']['Test_RMSE'],
"MAE": res['Base']['Test_MAE'],
"R-squared": res['Base']['Test_R2'],
"Adj. R-squared": res['Base']['Test_Adj_R2'],
"MAPE": res['Base']['Test_MAPE']
}
# Tuned
train_data[f"{model_name} Tuned"] = {
"RMSE": res['Tuned']['Train_RMSE'],
"MAE": res['Tuned']['Train_MAE'],
"R-squared": res['Tuned']['Train_R2'],
"Adj. R-squared": res['Tuned']['Train_Adj_R2'],
"MAPE": res['Tuned']['Train_MAPE']
}
test_data[f"{model_name} Tuned"] = {
"RMSE": res['Tuned']['Test_RMSE'],
"MAE": res['Tuned']['Test_MAE'],
"R-squared": res['Tuned']['Test_R2'],
"Adj. R-squared": res['Tuned']['Test_Adj_R2'],
"MAPE": res['Tuned']['Test_MAPE']
}
train_perf_df = pd.DataFrame(train_data)
test_perf_df = pd.DataFrame(test_data)
print("--- Training Performance ---")
display(train_perf_df)
print("\n--- Test Performance ---")
display(test_perf_df)
--- Training Performance ---
| Decision Tree | Decision Tree Tuned | Bagging | Bagging Tuned | Random Forest | Random Forest Tuned | AdaBoost | AdaBoost Tuned | Gradient Boosting | Gradient Boosting Tuned | XGBoost | XGBoost Tuned | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| RMSE | 0.0 | 249.103400 | 119.778117 | 169.973593 | 103.931866 | 181.975694 | 504.504577 | 453.161495 | 288.444908 | 239.678859 | 117.470972 | 251.423122 |
| MAE | 0.0 | 110.010911 | 45.707207 | 65.184170 | 39.974872 | 75.684495 | 397.925008 | 349.535101 | 137.727284 | 104.926863 | 57.654829 | 106.851265 |
| R-squared | 1.0 | 0.945159 | 0.987320 | 0.974466 | 0.990453 | 0.970733 | 0.775054 | 0.818510 | 0.926468 | 0.949230 | 0.987804 | 0.944133 |
| Adj. R-squared | 1.0 | 0.945060 | 0.987298 | 0.974421 | 0.990436 | 0.970681 | 0.774650 | 0.818183 | 0.926336 | 0.949139 | 0.987782 | 0.944032 |
| MAPE | 0.0 | 0.038898 | 0.016325 | 0.023321 | 0.014194 | 0.025373 | 0.134979 | 0.119579 | 0.050006 | 0.037268 | 0.019484 | 0.038685 |
--- Test Performance ---
| Decision Tree | Decision Tree Tuned | Bagging | Bagging Tuned | Random Forest | Random Forest Tuned | AdaBoost | AdaBoost Tuned | Gradient Boosting | Gradient Boosting Tuned | XGBoost | XGBoost Tuned | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| RMSE | 366.901777 | 322.756080 | 307.281111 | 290.981233 | 293.438586 | 292.350834 | 522.492268 | 467.605168 | 321.434073 | 301.646011 | 306.322052 | 299.909378 |
| MAE | 125.201617 | 143.142376 | 120.643322 | 110.480267 | 111.054875 | 116.952349 | 411.818592 | 356.914247 | 152.941935 | 131.054800 | 136.206914 | 127.859985 |
| R-squared | 0.882364 | 0.908969 | 0.917489 | 0.926010 | 0.924755 | 0.925312 | 0.761439 | 0.808927 | 0.909713 | 0.920487 | 0.918003 | 0.921400 |
| Adj. R-squared | 0.881870 | 0.908586 | 0.917142 | 0.925699 | 0.924439 | 0.924998 | 0.760436 | 0.808124 | 0.909334 | 0.920153 | 0.917659 | 0.921070 |
| MAPE | 0.054144 | 0.062199 | 0.051935 | 0.050329 | 0.049934 | 0.052058 | 0.158873 | 0.136739 | 0.066381 | 0.058430 | 0.056372 | 0.057209 |
Outcomes: Performance aggregated and tabulated.
Best Model Selection¶
Identifies best model.
#-------------
# BEST MODEL SELECTION
#-------------
# Find Best Model
best_model_name = test_perf_df.loc['RMSE'].astype(float).idxmin()
best_rmse = test_perf_df.loc['RMSE'].astype(float).min()
best_r2 = test_perf_df.loc['R-squared', best_model_name]
print(f"The Best Performing Model is: {best_model_name}")
print(f"Test RMSE: {best_rmse:.4f}")
print(f"Test R-Squared: {best_r2:.4f}")
if "Tuned" in best_model_name:
original_name = best_model_name.replace(" Tuned", "")
print(f"Best Hyperparameters: {results[original_name]['Tuned']['Best Params']}")
The Best Performing Model is: Bagging Tuned
Test RMSE: 290.9812
Test R-Squared: 0.9260
Best Hyperparameters: {'regressor__max_samples': 0.5, 'regressor__n_estimators': 100}
Observations: Bagging Tuned delivers the lowest test RMSE and is selected as the primary deployment model.
#-------------
# DETERMINISTIC MODEL SERIALIZATION
#-------------
# Persists the tuned pipelines as joblib artifacts for deployment.
from pathlib import Path
import joblib
import shutil
# Create deployment_files directory fresh
deployment_dir = Path("deployment_files")
if deployment_dir.exists():
shutil.rmtree(deployment_dir)
deployment_dir.mkdir()
# Extract tuned models
bagging_model = results["Bagging"]["Tuned"]["Fitted Estimator"]
rf_model = results["Random Forest"]["Tuned"]["Fitted Estimator"]
# Define artifact paths
BAGGING_ARTIFACT_PATH = deployment_dir / "bagging_tuned.joblib"
RF_ARTIFACT_PATH = deployment_dir / "random_forest_tuned.joblib"
# Save artifacts
joblib.dump(bagging_model, BAGGING_ARTIFACT_PATH)
joblib.dump(rf_model, RF_ARTIFACT_PATH)
print("Artifacts saved:")
print(" -", BAGGING_ARTIFACT_PATH)
print(" -", RF_ARTIFACT_PATH)
Artifacts saved:
 - deployment_files/bagging_tuned.joblib
 - deployment_files/random_forest_tuned.joblib
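A quick round-trip confirms a serialized model predicts identically after reload; a sketch with a small stand-in regressor (the real artifacts hold the full tuned pipelines):

```python
import tempfile
from pathlib import Path

import joblib
import numpy as np
from sklearn.ensemble import BaggingRegressor

# Stand-in model on toy data; the deployed artifacts are full pipelines
X_toy = np.arange(20, dtype=float).reshape(-1, 1)
y_toy = X_toy.ravel() * 3.0
toy_model = BaggingRegressor(random_state=1).fit(X_toy, y_toy)

with tempfile.TemporaryDirectory() as tmp:
    path = Path(tmp) / "bagging_tuned.joblib"
    joblib.dump(toy_model, path)      # serialize
    reloaded = joblib.load(path)      # deserialize
    same = bool(np.allclose(toy_model.predict(X_toy), reloaded.predict(X_toy)))
print(same)
```

Running this check against the actual artifacts before publishing guards against version-skew surprises at load time.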
Model Serialization¶
Based on test RMSE across tuned models, the top two tuned pipelines are serialized for deployment.
#-------------
# COPY ARTIFACTS TO BACKEND ROOT
#-------------
# 4. Copy serialized artifacts into backend root
from pathlib import Path
backend_root = Path("backend_space_root")
backend_root.mkdir(parents=True, exist_ok=True)
if not BAGGING_ARTIFACT_PATH.exists():
raise FileNotFoundError("Missing bagging_tuned.joblib artifact")
if not RF_ARTIFACT_PATH.exists():
raise FileNotFoundError("Missing random_forest_tuned.joblib artifact")
shutil.copy(BAGGING_ARTIFACT_PATH, backend_root / "bagging_tuned.joblib")
shutil.copy(RF_ARTIFACT_PATH, backend_root / "random_forest_tuned.joblib")
print("Model artifacts copied to backend_space_root/")
Model artifacts copied to backend_space_root/
Outcomes:
Bagging Tuned achieved the lowest test RMSE (290.981233) and highest test R-squared (0.926010); this model provides the strongest generalization performance and should be selected for deployment.
Executive Outcomes:
RMSE (Root Mean Squared Error): A test RMSE of roughly 291 means that, on average, the model's sales predictions are off by about $291 per product-store combination. This is a direct measure of the typical financial error in the forecasts, small relative to per-store mean sales that range from roughly $1,800 to $4,900.
R-squared (Coefficient of Determination): An R-squared value of approximately 92.6% means that the model explains about 92.6% of the variability in actual product-store sales. The large majority of the variation in sales is therefore captured, providing a reliable basis for forecasting and strategic planning. This predictive power supports more accurate inventory management and revenue projections, directly impacting profitability.
8. Deployment Assets (Hugging Face)¶
This section prepares production-ready artifacts for deployment to Hugging Face Spaces. The serialized tuned models are loaded directly without retraining or reconstruction. The backend supports model switching and revenue aggregation.
9. Backend Deployment (Hugging Face Space)¶
This section publishes the backend inference service to Hugging Face Spaces. The service loads the serialized tuned models and supports model switching at inference time.
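The backend contract is a JSON body with a `model` key (for runtime model switching) and a list of feature `rows`. A client-side sketch of building that payload (the field values are illustrative; the live request is commented out because it requires the deployed Space):

```python
import json

# Feature row matching the training schema (values are illustrative)
row = {
    "Product_Weight": 12.66,
    "Product_Sugar_Content": "Regular",
    "Product_Allocated_Area": 0.05,
    "Product_MRP": 140.0,
    "Store_Size": "Medium",
    "Store_Location_City_Type": "Tier 2",
    "Store_Type": "Supermarket Type1",
    "Product_Id_char": "FD",
    "Store_Age_Years": 12.0,
    "Product_Type": "Dairy",
    "Product_Type_Category": "Perishables",
}

def build_payload(model_name: str, rows: list) -> dict:
    """Build the /v1/predict request body; model_name selects the served model."""
    return {"model": model_name, "rows": rows}

# Switching models is just a different "model" value in the body
payload = build_payload("secondary model - rf", [row])
print(json.dumps(payload)[:60])

# import requests
# resp = requests.post(
#     "https://EvagAIML-RetailPrediction001Backend.hf.space/v1/predict",
#     json=payload, timeout=30,
# )
```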
#-------------
# CODE EXECUTION
#-------------
# Executes the defined logic.
import os
Observations: The output above confirms the successful execution of the step, validating the data structure/transformation before proceeding.
#-------------
# CODE EXECUTION
#-------------
# Builds the backend Space files (Dockerfile, requirements.txt, app.py).
from pathlib import Path
print("Checking backend_space_root...")
root = Path("backend_space_root")
files = [f.name for f in root.iterdir()]
print("Files found:", files)
# Create Dockerfile
dockerfile_content = """FROM python:3.12-slim
WORKDIR /app
COPY requirements.txt .
RUN pip install -r requirements.txt
COPY . .
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
"""
(root / "Dockerfile").write_text(dockerfile_content)
# Create requirements.txt
# Including all libraries used in the notebook to ensure compatibility and avoid missing dependency errors
requirements_content = """fastapi==0.111.0
uvicorn==0.30.1
scikit-learn==1.6.1
joblib==1.4.2
pandas==2.2.2
numpy==2.0.2
xgboost==2.1.4
huggingface_hub==0.30.1
streamlit==1.43.2
matplotlib==3.10.0
seaborn==0.13.2
requests==2.32.3
"""
(root / "requirements.txt").write_text(requirements_content)
# Create app.py for FastAPI backend
app_content = """from fastapi import FastAPI, HTTPException
from typing import Any, Dict
import joblib
import pandas as pd

app = FastAPI()

# Load the serialized tuned models once at startup
MODELS = {
    "primary model - bg": joblib.load("bagging_tuned.joblib"),
    "secondary model - rf": joblib.load("random_forest_tuned.joblib"),
}

@app.get("/")
def read_root():
    return {"status": "healthy", "message": "Retail Prediction Backend is running. Use /v1/predict for predictions."}

@app.get("/health")
def health_check():
    return {"status": "healthy"}

@app.post("/v1/predict")
async def predict(request_body: Dict[str, Any]):
    model_name = request_body.get("model", "primary model - bg")
    rows = request_body.get("rows", [])
    if not rows:
        raise HTTPException(status_code=400, detail="No data rows provided for prediction.")
    model = MODELS.get(model_name)
    if not model:
        raise HTTPException(status_code=404, detail=f"Model '{model_name}' not found.")
    try:
        df = pd.DataFrame(rows)
        predictions = model.predict(df).tolist()
        # Calculate overall total
        overall_total = sum(predictions)
        # Calculate store-level totals if 'Store_Id' is present in the input data
        store_totals = {}
        if 'Store_Id' in df.columns:
            df['predictions'] = predictions
            store_totals = df.groupby('Store_Id')['predictions'].sum().to_dict()
        return {"model_used": model_name, "predictions": predictions, "overall_total": overall_total, "store_totals": store_totals}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
"""
(root / "app.py").write_text(app_content)
assert "Dockerfile" in files or (root / "Dockerfile").exists()
assert "requirements.txt" in files or (root / "requirements.txt").exists()
assert "app.py" in files or (root / "app.py").exists()
assert "bagging_tuned.joblib" in files
assert "random_forest_tuned.joblib" in files
reqs = (root / "requirements.txt").read_text()
print("Backend validation PASSED")
Checking backend_space_root...
Files found: ['app.py', 'requirements.txt', 'bagging_tuned.joblib', 'random_forest_tuned.joblib', 'Dockerfile']
Backend validation PASSED
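The revenue aggregation inside `/v1/predict` is a plain pandas groupby. Reproduced standalone with dummy predictions to show the response shape:

```python
import pandas as pd

# Dummy rows and predictions standing in for real model output
df = pd.DataFrame({"Store_Id": ["S1", "S1", "S2"]})
predictions = [100.0, 150.0, 200.0]

overall_total = sum(predictions)
df["predictions"] = predictions
store_totals = df.groupby("Store_Id")["predictions"].sum().to_dict()

print(overall_total)   # 450.0
print(store_totals)    # {'S1': 250.0, 'S2': 200.0}
```

The dict form is what the endpoint returns under `store_totals`, so the frontend can render per-store totals without recomputing anything.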
#-------------
# CODE EXECUTION
#-------------
# Authenticates with the Hugging Face Hub so the Spaces can be updated.
from huggingface_hub import login
login()
#-------------
# CODE EXECUTION
#-------------
# Uploads the backend Space folder to the Hugging Face Hub.
from huggingface_hub import upload_folder
upload_folder(
repo_id="EvagAIML/RetailPrediction001Backend",
folder_path="backend_space_root",
path_in_repo=".",
repo_type="space",
commit_message="v23 deploy backend (Docker Python 3.12)",
)
No files have been modified since last commit. Skipping to prevent empty commit.
CommitInfo(commit_url='https://huggingface.co/spaces/EvagAIML/RetailPrediction001Backend/commit/be1bd444b2dfbb8cec77ccab9853ea54a772d0c5', commit_message='v23 deploy backend (Docker Python 3.12)', commit_description='', oid='be1bd444b2dfbb8cec77ccab9853ea54a772d0c5', pr_url=None, repo_url=RepoUrl('https://huggingface.co/spaces/EvagAIML/RetailPrediction001Backend', endpoint='https://huggingface.co', repo_type='space', repo_id='EvagAIML/RetailPrediction001Backend'), pr_revision=None, pr_num=None)
10. Frontend Deployment (Hugging Face Space)¶
This section publishes a Streamlit UI that calls the backend inference API. The UI supports model selection and returns revenue predictions with optional store-level aggregation.
#-------------
# CODE EXECUTION
#-------------
# Imports dependencies and scaffolds the frontend Space directory.
import os
import shutil
from pathlib import Path
# Create frontend root
frontend_root = Path("frontend_space_root")
if frontend_root.exists():
shutil.rmtree(frontend_root)
frontend_root.mkdir()
# Create src
(frontend_root / "src").mkdir()
# 1. requirements.txt
req_content = """streamlit==1.43.2
requests==2.32.3
"""
(frontend_root / "requirements.txt").write_text(req_content)
# 2. src/streamlit_app.py
app_content = r"""import csv
import io
from typing import Any, Dict, List

import pandas as pd  # pulled in transitively by Streamlit; used for batch results
import requests
import streamlit as st
BACKEND_URL = "https://EvagAIML-RetailPrediction001Backend.hf.space/v1/predict"
NUMERIC_FIELDS = {
"Product_Weight": float,
"Product_Allocated_Area": float,
"Product_MRP": float,
"Store_Age_Years": float,
}
# Mapping: Friendly Name -> Original API Name
FIELD_MAPPING = {
"Product Weight": "Product_Weight",
"Sugar Content": "Product_Sugar_Content",
"Product Allocated Area": "Product_Allocated_Area",
"Product MRP": "Product_MRP",
"Store Size": "Store_Size",
"City Tier": "Store_Location_City_Type",
"Store Type": "Store_Type",
"Product Category ID": "Product_Id_char",
"Store Age": "Store_Age_Years",
"Product Type": "Product_Type",
# Derived: "Product_Type_Category"
}
def _coerce_row_types(row: Dict[str, Any]) -> Dict[str, Any]:
    out: Dict[str, Any] = {}
    for k, v in row.items():
        if v is None:
            continue
        if isinstance(v, str):
            v = v.strip()
            if v == "":
                continue
        if k in NUMERIC_FIELDS:
            try:
                out[k] = NUMERIC_FIELDS[k](v)
            except Exception:
                out[k] = v
        else:
            out[k] = v
    return out

def call_backend(model_name: str, rows: List[Dict[str, Any]], timeout_s: int = 60) -> Dict[str, Any]:
    payload = {"model": model_name, "rows": rows}
    resp = requests.post(BACKEND_URL, json=payload, timeout=timeout_s)
    if resp.status_code != 200:
        raise RuntimeError(f"Backend error ({resp.status_code}): {resp.text}")
    return resp.json()
st.set_page_config(page_title="Retail Revenue Forecast", layout="centered")
st.title("Retail Revenue Forecast")
st.caption("Thin Streamlit UI calling the backend API. Supports runtime model switching + batch CSV scoring.")
st.divider()
st.subheader("Administrator Controls")
model_choice = st.selectbox(
"Select Forecasting Model",
["primary model - bg", "secondary model - rf"],
index=0,
help="Matches backend model keys.",
key="model_choice"
)
show_raw = st.checkbox("Show raw JSON response", value=False, key="show_raw")
st.divider()
st.subheader("Single Prediction")
# Store ID
st.markdown("**Store ID**")
st.caption("Store_Id (Optional)")
st.caption("Unique identifier for the store. Enables aggregation of totals.")
store_id = st.text_input("", value="", label_visibility="collapsed", key="store_id").strip()
# Product Weight
st.markdown("**Product Weight**")
st.caption("Product_Weight")
st.caption("Weight of the product.")
Product_Weight = st.number_input("", min_value=0.0, value=12.66, step=0.01, label_visibility="collapsed", key="Product_Weight")
# Product Allocated Area
st.markdown("**Product Allocated Area**")
st.caption("Product_Allocated_Area")
st.caption("Percentage of display area allocated to this product in the store.")
Product_Allocated_Area = st.number_input("", min_value=0.0, value=0.0, step=0.01, label_visibility="collapsed", key="Product_Allocated_Area")
# Product MRP
st.markdown("**Product MRP**")
st.caption("Product_MRP")
st.caption("Maximum Retail Price (list price) of the product.")
Product_MRP = st.number_input("", min_value=0.0, value=0.0, step=0.01, label_visibility="collapsed", key="Product_MRP")
# Store Age
st.markdown("**Store Age**")
st.caption("Store_Age_Years")
st.caption("Age of the store in years (calculated from establishment year).")
Store_Age_Years = st.number_input("", min_value=0.0, value=0.0, step=1.0, label_visibility="collapsed", key="Store_Age_Years")
# Sugar Content
st.markdown("**Sugar Content**")
st.caption("Product_Sugar_Content")
st.caption("Sugar content category.")
Product_Sugar_Content = st.selectbox("", ["Low Sugar", "Regular", "No Sugar"], index=1, label_visibility="collapsed", key="Product_Sugar_Content")
# Store Size
st.markdown("**Store Size**")
st.caption("Store_Size")
st.caption("Physical size of the store.")
Store_Size = st.selectbox("", ["Small", "Medium", "High"], index=1, label_visibility="collapsed", key="Store_Size")
# City Tier
st.markdown("**City Tier**")
st.caption("Store_Location_City_Type")
st.caption("Tier of the city based on urbanization and population density (Tier 1 = High, Tier 2 = Medium, Tier 3 = Low).")
Store_Location_City_Type = st.selectbox("", ["Tier 1", "Tier 2", "Tier 3"], index=1, label_visibility="collapsed", key="Store_Location_City_Type")
# Store Type
st.markdown("**Store Type**")
st.caption("Store_Type")
st.caption("Category of the store.")
Store_Type = st.selectbox(
"",
["Grocery Store", "Supermarket Type1", "Supermarket Type2", "Departmental Store", "Food Mart"],
index=0,
label_visibility="collapsed",
key="Store_Type"
)
# Product Category ID
st.markdown("**Product Category ID**")
st.caption("Product_Id_char")
st.caption("First two characters of the ID (FD=Food, NC=Non-Consumable, DR=Drink).")
Product_Id_char = st.selectbox("", ["FD", "NC"], index=0, label_visibility="collapsed", key="Product_Id_char")
# Product Type
PRODUCT_TYPES = [
"Baking Goods", "Breads", "Breakfast", "Canned", "Dairy", "Frozen Foods",
"Fruits and Vegetables", "Hard Drinks", "Health and Hygiene", "Household",
"Meat", "Others", "Seafood", "Snack Foods", "Soft Drinks", "Starchy Foods"
]
PERISHABLES = ["Dairy", "Meat", "Fruits and Vegetables", "Breakfast", "Breads", "Seafood", "Starchy Foods"]
st.markdown("**Product Type**")
st.caption("Product_Type")
st.caption("Specific category of the product.")
Product_Type = st.selectbox("", PRODUCT_TYPES, index=4, label_visibility="collapsed", key="Product_Type")
# Derive Category
if Product_Type in PERISHABLES:
    Product_Type_Category = "Perishables"
else:
    Product_Type_Category = "Non Perishables"

row = {
    "Product_Weight": Product_Weight,
    "Product_Sugar_Content": Product_Sugar_Content,
    "Product_Allocated_Area": Product_Allocated_Area,
    "Product_MRP": Product_MRP,
    "Store_Size": Store_Size,
    "Store_Location_City_Type": Store_Location_City_Type,
    "Store_Type": Store_Type,
    "Product_Id_char": Product_Id_char,
    "Store_Age_Years": Store_Age_Years,
    "Product_Type": Product_Type,
    "Product_Type_Category": Product_Type_Category,
}
if store_id:
    row["Store_Id"] = store_id
if st.button("Predict", type="primary", key="predict_single"):
    try:
        result = call_backend(model_choice, [_coerce_row_types(row)], timeout_s=30)
        st.success(f"Model used: {result.get('model_used', model_choice)}")
        preds = result.get("predictions", [])
        if preds:
            st.metric("Predicted revenue (this row)", f"{float(preds[0]):,.2f}")
        overall_total = result.get("overall_total", None)
        if overall_total is not None:
            st.metric("Overall Total Revenue", f"{float(overall_total):,.2f}")
        store_totals = result.get("store_totals", {})
        if store_totals:
            st.write("Store Totals:", store_totals)
        if show_raw:
            st.json(result)
    except Exception as e:
        st.error(f"Error: {e}")
st.divider()
st.subheader("Batch Prediction (CSV Upload)")
st.caption("Upload a CSV with one row per record. Column names must match the model’s expected feature names.")
uploaded = st.file_uploader("Upload CSV", type=["csv"], key="csv_uploader")
if uploaded is not None:
    try:
        raw = uploaded.getvalue().decode("utf-8", errors="replace")
        reader = csv.DictReader(io.StringIO(raw))
        rows_in = [r for r in reader]
        if not rows_in:
            st.warning("No rows found in CSV.")
        else:
            st.write("Detected columns:", reader.fieldnames)
            st.write("Preview (first 3 rows):")
            st.json(rows_in[:3])
            if st.button("Predict Batch", key="predict_batch"):
                res = call_backend(model_choice, [_coerce_row_types(r) for r in rows_in], timeout_s=60)
                st.success(f"Batch prediction complete (Model: {res.get('model_used', 'unknown')})")
                preds = res.get("predictions", [])
                if preds:
                    df_res = pd.DataFrame(rows_in)
                    df_res["Predicted_Revenue"] = preds
                    st.dataframe(df_res)
                    # Convert to CSV for download
                    csv_buffer = io.StringIO()
                    df_res.to_csv(csv_buffer, index=False)
                    st.download_button(
                        label="Download Predictions as CSV",
                        data=csv_buffer.getvalue(),
                        file_name="predictions.csv",
                        mime="text/csv",
                        key="download_csv",
                    )
                overall_total = res.get("overall_total", None)
                if overall_total is not None:
                    st.metric("Batch Total Revenue", f"{float(overall_total):,.2f}")
                store_totals = res.get("store_totals", {})
                if store_totals:
                    st.write("Batch Store Totals:", store_totals)
    except Exception as e:
        st.error(f"Error processing batch: {e}")
"""
(frontend_root / "src" / "streamlit_app.py").write_text(app_content)
9169
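CSV uploads arrive as strings, so the UI coerces numeric fields before posting to the backend. A simplified standalone rerun of that coercion (same NUMERIC_FIELDS mapping as the app, but without the app's fallback on cast errors):

```python
import csv
import io

# Numeric columns that must be cast from CSV strings (mirrors the app)
NUMERIC_FIELDS = {
    "Product_Weight": float,
    "Product_Allocated_Area": float,
    "Product_MRP": float,
    "Store_Age_Years": float,
}

def coerce_row(row: dict) -> dict:
    """Drop blank values and cast numeric columns, leaving others as strings."""
    out = {}
    for k, v in row.items():
        if v is None or (isinstance(v, str) and not v.strip()):
            continue
        out[k] = NUMERIC_FIELDS[k](v) if k in NUMERIC_FIELDS else v.strip()
    return out

sample = "Product_Weight,Product_MRP,Store_Size\n12.66,140.5,Medium\n"
rows = [coerce_row(r) for r in csv.DictReader(io.StringIO(sample))]
print(rows)  # [{'Product_Weight': 12.66, 'Product_MRP': 140.5, 'Store_Size': 'Medium'}]
```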
#-------------
# CODE EXECUTION
#-------------
# Validates the frontend Space file layout before upload.
from pathlib import Path
print("Checking frontend_space_root...")
root = Path("frontend_space_root")
files = [f.name for f in root.iterdir()]
print("Root files:", files)
src_files = [f.name for f in (root / "src").iterdir()] if (root/"src").exists() else []
print("Src files:", src_files)
assert "streamlit_app.py" in src_files
assert (root / "src" / "streamlit_app.py").exists()
reqs = (root / "requirements.txt").read_text()
assert "pandas" not in reqs
print("Frontend validation PASSED")
Checking frontend_space_root...
Root files: ['requirements.txt', 'src']
Src files: ['streamlit_app.py']
Frontend validation PASSED
#-------------
# CODE EXECUTION
#-------------
# Uploads the frontend Space folder to the Hugging Face Hub.
upload_folder(
repo_id="EvagAIML/RetailPrediction001frontend",
folder_path="frontend_space_root",
path_in_repo=".",
repo_type="space",
commit_message="v23 deploy frontend (Docker Streamlit template)"
)
No files have been modified since last commit. Skipping to prevent empty commit.
CommitInfo(commit_url='https://huggingface.co/spaces/EvagAIML/RetailPrediction001frontend/commit/ab1c39f8063a73c5bf3d88746882d7da008f8279', commit_message='v23 deploy frontend (Docker Streamlit template)', commit_description='', oid='ab1c39f8063a73c5bf3d88746882d7da008f8279', pr_url=None, repo_url=RepoUrl('https://huggingface.co/spaces/EvagAIML/RetailPrediction001frontend', endpoint='https://huggingface.co', repo_type='space', repo_id='EvagAIML/RetailPrediction001frontend'), pr_revision=None, pr_num=None)
HF checks:¶
Backend build logs must show FROM python:3.12-slim.
The frontend app entry point is src/streamlit_app.py.
If the build looks stale, trigger a "Factory rebuild" from the Space settings.
Expanded Executive Summary¶
This analysis has successfully established a scalable predictive modeling framework for retail sales. By rigorously comparing multiple regression algorithms—including Bagging, Random Forest, and XGBoost—we have identified the optimal configuration for accuracy and generalizability.
The solution goes beyond simple prediction; it integrates a full data processing pipeline that handles missing values, standardizes inconsistent data (e.g., sugar content), and engineers high-value features like Store_Age and Product_Type_Category. This ensures that the model remains robust even as raw data quality fluctuates.
Most importantly, deploying the model behind a user-friendly Streamlit application democratizes access to advanced analytics, allowing store managers and inventory planners to make data-driven decisions; its value will grow further as the model is expanded and integrated into company systems.
Actionable Insights and Business Recommendations¶
Based on the data analysis and model performance, we recommend the following:
- Prioritize Established Stores: Older stores (Store_Age_Years) show distinct sales patterns. Inventory planning should differ for mature vs. new locations.
- Target Tier 2 & 3 Optimization: 'Supermarket Type1' in Tier 2 cities drives significant volume. Tailor promotions specifically for these high-impact segments.
- Perishables Management: The derived Product_Type_Category indicates different churn rates. Implement tighter inventory cycles for 'Perishables' to reduce waste.
Future Model Development¶
- Integration with ERP: Connect the prediction API directly to the central ERP system to automate replenishment orders based on forecasted demand.
- Real-time Sales Feedback: Ingest daily sales data to retrain the model quarterly, capturing evolving consumer trends and seasonality.
- Price Sensitivity Analysis: Use the Product_MRP feature to simulate the impact of price changes on total revenue, aiding in strategic pricing decisions.
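A price-sensitivity simulation amounts to sweeping candidate MRP values through a fitted model and comparing the forecasts. A toy sketch with a synthetic linear model (purely illustrative; the real analysis would use the tuned deployment pipeline and the full feature row):

```python
import numpy as np
from sklearn.linear_model import LinearRegression

# Synthetic stand-in: revenue rises with MRP in this toy data
rng = np.random.default_rng(0)
mrp = rng.uniform(50, 250, size=200).reshape(-1, 1)
revenue = 4.0 * mrp.ravel() + rng.normal(0, 20, size=200)
model = LinearRegression().fit(mrp, revenue)

# Sweep a candidate price grid and compare forecast revenue at each price
price_grid = np.array([[100.0], [120.0], [140.0]])
forecasts = model.predict(price_grid)
for p, f in zip(price_grid.ravel(), forecasts):
    print(f"MRP {p:.0f} -> forecast revenue {f:.1f}")
```

With the deployed model, the same sweep would vary only Product_MRP while holding the other features of a product-store row fixed.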