Discover the power of neural networks in analyzing tabular data through our comprehensive tutorial. In this video, we dive into the world of predictive modeling using neural networks, enabling you to unlock valuable insights and make accurate predictions from structured data.
We introduce the concept of neural networks and their ability to handle complex relationships and patterns in tabular data.
Using Python and the popular deep learning library fastai, we guide you through the step-by-step process of building and training a neural network model for predictive modeling. You'll learn how to preprocess the data, handle missing values, normalize features, and split the dataset into training and validation sets.
Through a practical example on a simulated dataset, we demonstrate how neural networks can be applied to tasks such as customer credit risk assessment. You'll see firsthand how neural networks can uncover hidden patterns and make accurate predictions from tabular data.
# code below
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from fastai.tabular.all import *  # brings in TabularPandas, Learner, Module, Categorify, etc.
# Simulated data: every feature is drawn independently of the default flag,
# so there is no real signal to learn; the point here is the mechanics.
np.random.seed(0)
n_samples = 100000
age = np.random.rand(n_samples)  # continuous feature in [0, 1)
gender = np.random.choice([0, 1], size=n_samples)
occupation = np.random.choice([0, 1, 2], size=n_samples)
zip_code = np.random.choice([0, 1, 2, 3], size=n_samples)
default_flag = np.random.choice([0, 1], size=n_samples, p=[0.8, 0.2])  # ~20% defaults
data = pd.DataFrame({
    'age': age,
    'gender': gender,
    'occupation': occupation,
    'zip_code': zip_code,
    'default_flag': default_flag
})
data['default_flag'] = data['default_flag'].astype(float)  # BCE loss expects float targets
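# Optional sanity check (an addition for illustration, not part of the original
# script): peek at the table and confirm the ~80/20 class balance
print(data.head())
print(data['default_flag'].value_counts(normalize=True))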
# Data preprocessing: encode categories, fill missing values, normalize continuous features
procs = [Categorify, FillMissing, Normalize]
splits = RandomSplitter(valid_pct=0.2)(range_of(data))
to = TabularPandas(data, procs=procs,
                   cat_names=['gender', 'occupation', 'zip_code'],
                   cont_names=['age'], y_names='default_flag', splits=splits)
dls = to.dataloaders(bs=64)
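# Optional: inspect one processed batch to confirm the procs did their job
# (integer category codes plus a normalized 'age' column)
dls.show_batch(max_n=5)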
# fastai's Module is an nn.Module that calls super().__init__() for you
class CollabNN(Module):
    def __init__(self, emb_szs, n_cont, layers, out_sz, y_range=None):
        # One embedding table per categorical variable
        self.embeds = nn.ModuleList([nn.Embedding(ni, nf) for ni, nf in emb_szs])
        self.emb_drop = nn.Dropout(0.4)
        # Total width of the concatenated embeddings
        n_emb = sum(nf for _, nf in emb_szs)
        self.bn_cont = nn.BatchNorm1d(n_emb + n_cont)
        layer_sizes = [n_emb + n_cont] + layers
        self.layers = nn.ModuleList([nn.Linear(layer_sizes[i], layer_sizes[i+1])
                                     for i in range(len(layer_sizes)-1)])
        self.fc = nn.Linear(layers[-1], out_sz)
        self.y_range = y_range

    def forward(self, x_cat, x_cont):
        # Embed each categorical variable, then apply dropout
        emb_gender = self.embeds[0](x_cat[:, 0])
        emb_occupation = self.embeds[1](x_cat[:, 1])
        emb_zip_code = self.embeds[2](x_cat[:, 2])
        x = self.emb_drop(torch.cat([emb_gender, emb_occupation, emb_zip_code], 1))
        x = torch.cat([x, x_cont], 1)  # combine categorical and continuous features
        x = self.bn_cont(x)
        for layer in self.layers:
            x = F.relu(layer(x))
        x = self.fc(x)
        # Optionally squash the output into y_range; only use this together with a
        # loss that expects probabilities (e.g. nn.BCELoss), not BCEWithLogitsLoss
        if self.y_range is not None:
            x = torch.sigmoid(x) * (self.y_range[1] - self.y_range[0]) + self.y_range[0]
        return x
n_act = 100  # embedding width for each categorical variable
# Vocabulary sizes after Categorify; each includes an extra '#na#' category:
# gender 2+1, occupation 3+1, zip_code 4+1
emb_szs = [(3, n_act), (4, n_act), (5, n_act)]
n_cont = 1          # number of continuous variables (age)
layers = [100, 50]  # sizes of the hidden layers
out_sz = 1          # a single logit for binary classification
y_range = None      # keep raw logits; BCEWithLogitsLoss applies the sigmoid itself
model = CollabNN(emb_szs, n_cont, layers, out_sz, y_range)
# accuracy_multi thresholds sigmoid(logit) at 0.5; plain accuracy would argmax
# over a single logit and always predict 0
learn = Learner(dls, model, loss_func=nn.BCEWithLogitsLoss(), metrics=accuracy_multi)
learn.fit_one_cycle(1, 1e-3)
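# Aside (a sketch for comparison, not the approach built above): fastai's
# built-in tabular_learner assembles a similar embeddings-plus-MLP model in
# one line, inferring embedding sizes from the data automatically
learn_builtin = tabular_learner(dls, layers=[100, 50], n_out=1,
                                loss_func=nn.BCEWithLogitsLoss(), metrics=accuracy_multi)
learn_builtin.fit_one_cycle(1, 1e-3)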
# Switch the model to evaluation mode (disables dropout)
learn.model.eval()
# Build an inference dataloader over the full dataset
test_dl = learn.dls.test_dl(data)
# Iterate over the dataloader and collect predictions
predictions = []
with torch.no_grad():
    for batch in test_dl:
        x_cat, x_cont = batch[:2]  # categorical codes and continuous features
        pred = learn.model(x_cat, x_cont)    # forward pass -> raw logits
        pred_prob = torch.sigmoid(pred)      # logits -> probabilities
        pred_label = torch.round(pred_prob)  # threshold at 0.5 -> 0/1 labels
        predictions.extend(pred_label.squeeze(1).tolist())
# Convert the predictions to a NumPy array and inspect the distinct labels
predictions = np.array(predictions)
unique_labels = np.unique(predictions)
print(unique_labels)
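# Extra check (an addition, not in the original script): test_dl preserves row
# order, so predictions line up with data. Because the labels were simulated
# independently of the features, the model should settle on the majority
# class, giving accuracy near the 0.8 class prior.
true_labels = data['default_flag'].to_numpy()
print('Accuracy vs. simulated labels:', (predictions == true_labels).mean())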