Pseudocode of the paper
# Initialize model with shared trunk and independent output heads
Initialize model with shared trunk and independent output heads
for each head in output_heads:
    Initialize head-specific parameters

# Define the forward pass function
def forward_pass(input_sequence):
    context = shared_trunk(input_sequence)
    predictions = []
    for head in output_heads:
        prediction = head(context)  # Each head predicts based on the shared context
        predictions.append(prediction)
    return predictions

# Define the loss calculation function
def calculate_loss(predictions, true_future_tokens):
    losses = []
    for i, prediction in enumerate(predictions):
        # Calculate loss for each head's prediction; could be cross-entropy
        loss = cross_entropy_loss(prediction, true_future_tokens[i])
        losses.append(loss)
    return losses

# Define the backpropagation function
def backpropagate(total_loss):
    # Compute gradients for each parameter in the model based on the total loss
    total_loss.backward()  # Computes and accumulates gradients; parameters are updated later by the optimizer

# Define the parameter update function
def update_parameters(optimizer):
    optimizer.step()       # Updates the model parameters using the computed gradients
    optimizer.zero_grad()  # Resets gradients after updating

# Example training loop
for epoch in range(num_epochs):
    for batch in data_loader:
        input_sequence, true_future_tokens = batch
        predictions = forward_pass(input_sequence)
        losses = calculate_loss(predictions, true_future_tokens)
        total_loss = sum(losses)  # Combine losses from all heads for backpropagation
        backpropagate(total_loss)
        update_parameters(optimizer)
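
To make the pseudocode concrete, here is a minimal runnable sketch in PyTorch. The names and shapes (MultiTokenPredictor, trunk_dim, num_future_heads, the toy data) are my own illustrative assumptions, not the paper's actual implementation, and the shared trunk is stood in by a small Transformer encoder without a causal mask purely for brevity.

import torch
import torch.nn as nn

class MultiTokenPredictor(nn.Module):
    # Shared trunk plus one independent linear head per future token position
    def __init__(self, vocab_size, trunk_dim=256, num_future_heads=4, num_layers=2):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, trunk_dim)
        # Stand-in shared trunk (4 attention heads, no causal mask, for brevity)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=trunk_dim, nhead=4, batch_first=True
        )
        self.shared_trunk = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        # Independent output heads: head i predicts the (i+1)-th future token
        self.output_heads = nn.ModuleList(
            [nn.Linear(trunk_dim, vocab_size) for _ in range(num_future_heads)]
        )

    def forward(self, input_sequence):
        context = self.shared_trunk(self.embed(input_sequence))
        # Every head predicts from the same shared context
        return [head(context) for head in self.output_heads]

def calculate_loss(predictions, true_future_tokens):
    # Cross-entropy between each head's logits and its future-token targets
    loss_fn = nn.CrossEntropyLoss()
    return [
        loss_fn(pred.flatten(0, 1), target.flatten())
        for pred, target in zip(predictions, true_future_tokens)
    ]

# Toy usage with random data; real training would iterate over a data_loader
vocab_size, batch, seq_len, num_future_heads = 1000, 8, 32, 4
model = MultiTokenPredictor(vocab_size, num_future_heads=num_future_heads)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

input_sequence = torch.randint(0, vocab_size, (batch, seq_len))
true_future_tokens = [torch.randint(0, vocab_size, (batch, seq_len))
                      for _ in range(num_future_heads)]

predictions = model(input_sequence)
losses = calculate_loss(predictions, true_future_tokens)
total_loss = sum(losses)   # combine the per-head losses
total_loss.backward()      # compute gradients for the trunk and all heads
optimizer.step()           # update parameters
optimizer.zero_grad()      # reset gradients for the next step

Because all heads read the same shared context, the summed loss sends gradients from every head back through the trunk, while each head's own parameters receive gradients only from its own future-token loss.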
