# 1 Setup

## 1.1 Google Drive - Kaggle

In [None]:
# Google drive setup
# Uses the google.colab helper, so this cell only runs inside Google Colab.
from google.colab import drive

# Mount the Drive under /content/gdrive; force_remount=True asks for a fresh
# mount even when one is already present, so the cell can be re-run.
drive.mount("/content/gdrive", force_remount=True)

Mounted at /content/gdrive


In [None]:
import getpass
import json
import os

# Kaggle credentials are taken from the KAGGLE_USERNAME / KAGGLE_KEY environment
# variables when set and prompted for otherwise, so no secret is ever stored in
# the notebook itself (notebooks get shared and committed).
api_token = {
  "username": os.environ.get("KAGGLE_USERNAME") or input("Kaggle username: "),
  "key": os.environ.get("KAGGLE_KEY") or getpass.getpass("Kaggle API key: "),
}

# Write the token to the same two locations as before: the Colab working copy
# and the per-user ~/.kaggle directory.
for kaggle_dir in ("/content/.kaggle", os.path.expanduser("~/.kaggle")):
  # exist_ok keeps the cell re-runnable (plain mkdir fails on the second run)
  os.makedirs(kaggle_dir, exist_ok=True)
  token_path = os.path.join(kaggle_dir, "kaggle.json")

  # Create the file readable/writable by the owner only from the very start,
  # instead of writing it world-readable and tightening it afterwards.
  fd = os.open(token_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
  with os.fdopen(fd, "w") as file:
    json.dump(api_token, file)

  # The mode passed to os.open only applies to newly created files; this also
  # tightens a pre-existing file left behind by an earlier run.
  os.chmod(token_path, 0o600)

In [None]:
# Force a fresh install of the Kaggle CLI package itself (--no-deps leaves its
# dependencies untouched), then print the installed version as a sanity check.
!pip install --upgrade --force-reinstall --no-deps kaggle
!kaggle --version

## 1.2 Kaggle Data Download

In [None]:
# download data
# Fetches the competition archive; the unzip step below expects it as
# ./11785-fall2021-hw3p2.zip in the current working directory.
!kaggle competitions download -c 11785-fall2021-hw3p2

In [None]:
!mkdir data

!unzip -qo './11785-fall2021-hw3p2.zip' -d data 

In [None]:
# List the extraction target to confirm the archive was unpacked into data/.
!ls data/

## 1.3 Library Installations

Install [ctcdecode](https://github.com/parlance/ctcdecode)

In [None]:
# Fetch the ctcdecode sources together with their git submodules (--recursive).
!git clone --recursive https://github.com/parlance/ctcdecode.git
# NOTE(review): presumably the `wget` package is needed by ctcdecode's build step — confirm.
!pip install wget
# Install from inside the checkout, then return to the parent directory so that
# later relative paths (e.g. data/) keep resolving.
%cd ctcdecode
!pip install .
%cd ..

Install [levenshtein distance calculation library](https://github.com/ztane/python-Levenshtein) 

In [None]:
# Provides the `Levenshtein` module imported in section 1.4.
!pip install python-Levenshtein

## 1.4 Libraries & Setup

In [None]:
import os
import sys
import time

import Levenshtein

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import pdb
import gc
from tqdm.notebook import trange, tqdm

import torch
import torch.nn as nn

from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Pick the compute device and the matching number of DataLoader workers:
# GPU with 8 workers when CUDA is usable, otherwise CPU with no workers.
cuda = torch.cuda.is_available()
if cuda:
  device = torch.device("cuda")
  num_workers = 8
else:
  device = torch.device("cpu")
  num_workers = 0

# print() stringifies each argument itself, so no explicit str() is needed
print("Cuda = ", cuda, " with num_workers = ", num_workers,  " system version = ", sys.version)

# 2 Data Loading

## 2.1 Load Data

In [None]:
# load training and dev data
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code — only load files that come from a trusted source.
train_data, train_labels = (
  np.load(path, allow_pickle=True)
  for path in ('data/HW3P2_Data/train.npy', 'data/HW3P2_Data/train_labels.npy')
)

dev_data, dev_labels = (
  np.load(path, allow_pickle=True)
  for path in ('data/HW3P2_Data/dev.npy', 'data/HW3P2_Data/dev_labels.npy')
)

# load test data (features only, no labels file is read for this split)
test_data = np.load('data/HW3P2_Data/test.npy', allow_pickle=True)

In [None]:
# Report the outer shape of every loaded array, one per line.
print(
  f'Train data: {train_data.shape}',
  f'Train labels {train_labels.shape}',
  f'Dev data: {dev_data.shape}',
  f'Dev labels {dev_labels.shape}',
  f'Test data: {test_data.shape}',
  sep='\n',
)

## 2.2 Custom Dataset Classes

In [None]:
# Define dataset class
class MyDataSet(Dataset):
  """Dataset of (features, labels) pairs for the train and dev splits.

  Args:
    x: indexable collection holding one feature array per utterance.
    y: indexable collection holding one label sequence per utterance;
       must have the same length as ``x``.

  Raises:
    ValueError: if ``x`` and ``y`` do not contain the same number of items.
  """

  # load the dataset
  def __init__(self, x, y):
    if len(x) != len(y):
      raise ValueError(f'x has {len(x)} items but y has {len(y)}')
    self.X = x
    self.Y = y

  # get number of items/rows in dataset
  def __len__(self):
    return len(self.Y)

  # get row item at some index
  def __getitem__(self, index):
    x = torch.FloatTensor(self.X[index])
    y = torch.LongTensor(self.Y[index])

    return x, y

  # staticmethod so that both MyDataSet.collate_fn and instance.collate_fn can
  # be handed to a DataLoader (a plain function would receive the instance as
  # `batch` when looked up on an instance).
  @staticmethod
  def collate_fn(batch):
    """Pad a list of variable-length samples into batch tensors.

    Args:
      batch: list of ``(x, y)`` pairs as produced by ``__getitem__``.

    Returns:
      Tuple ``(x_padded, y_padded, x_lengths, y_lengths)``. The padded tensors
      are batch-first and zero-padded up to the longest item in the batch; the
      length tensors (LongTensor) hold the original, unpadded lengths so that
      padding can be ignored downstream (e.g. by packing or by CTCLoss).
    """
    features, labels = zip(*batch)

    # Lengths must be recorded before padding erases them.
    x_lengths = torch.LongTensor([len(x) for x in features])
    y_lengths = torch.LongTensor([len(y) for y in labels])

    x_padded = pad_sequence(list(features), batch_first=True)
    y_padded = pad_sequence(list(labels), batch_first=True)

    return x_padded, y_padded, x_lengths, y_lengths

In [None]:
# Define dataset class
class TestDataSet(Dataset):
  """Dataset of unlabeled feature arrays for the test split.

  Args:
    x: indexable collection holding one feature array per utterance.
  """

  # load the dataset
  # TODO: replace x and y with dataset path and load data from here -> more efficient
  def __init__(self, x):
    self.X = x

  # get number of items/rows in dataset
  def __len__(self):
    return len(self.X)

  # get row item at some index
  def __getitem__(self, index):
    x = torch.FloatTensor(self.X[index])
    return x

  # staticmethod so that both TestDataSet.collate_fn and instance.collate_fn
  # can be handed to a DataLoader.
  @staticmethod
  def collate_fn(batch):
    """Pad a list of variable-length feature tensors into one batch tensor.

    Args:
      batch: list of tensors as produced by ``__getitem__``.

    Returns:
      Tuple ``(x_padded, x_lengths)``: a batch-first tensor zero-padded up to
      the longest item in the batch, and a LongTensor with the original,
      unpadded lengths.
    """
    features = list(batch)

    # Lengths must be recorded before padding erases them.
    x_lengths = torch.LongTensor([len(x) for x in features])
    x_padded = pad_sequence(features, batch_first=True)

    return x_padded, x_lengths


## 2.3 Data Loaders

In [None]:
batch_size = 64  # baseline value; tune to the available GPU memory

# training data — reshuffled every epoch
train = MyDataSet(train_data, train_labels)
train_args = dict(shuffle=True, batch_size=batch_size, num_workers=num_workers,
                  pin_memory=cuda, collate_fn=MyDataSet.collate_fn)
train_loader = DataLoader(train, **train_args)

# validation data — fixed order so that runs are comparable
dev = MyDataSet(dev_data, dev_labels)
dev_args = dict(shuffle=False, batch_size=batch_size, num_workers=num_workers,
                pin_memory=cuda, collate_fn=MyDataSet.collate_fn)
dev_loader = DataLoader(dev, **dev_args)

# test data — must NOT be shuffled: predictions are matched to utterances by position
test = TestDataSet(test_data)
test_args = dict(shuffle=False, batch_size=batch_size, num_workers=num_workers,
                 pin_memory=cuda, collate_fn=TestDataSet.collate_fn)
test_loader = DataLoader(test, **test_args)

# 3 Model Building

## 3.1 Model Creation

In [None]:
# Baseline model: bidirectional LSTM followed by a per-frame linear classifier
class LSTMModel(nn.Module):
  """Bidirectional LSTM that emits per-frame log-probabilities.

  Args:
    input_size: number of features per input frame.
    hidden_size: LSTM hidden size per direction.
    num_layers: number of stacked LSTM layers.
    output_size: number of output classes per frame.
  """

  def __init__(self, input_size, hidden_size, num_layers, output_size):
    super(LSTMModel, self).__init__()

    self.lstm = nn.LSTM(input_size, hidden_size, num_layers=num_layers,
                        batch_first=True, bidirectional=True)
    # Forward and backward states are concatenated, hence 2 * hidden_size.
    self.classifier = nn.Linear(2 * hidden_size, output_size)

  def forward(self, x, lengths):
    """Run the model on a zero-padded, batch-first batch.

    Args:
      x: float tensor of shape (batch, max_time, input_size).
      lengths: unpadded length of every sequence in ``x`` (tensor or list).

    Returns:
      Tuple ``(log_probs, out_lengths)``: log-probabilities of shape
      (batch, max_time, output_size) and the per-sequence output lengths
      (on CPU). ``nn.CTCLoss`` expects time-first input, so pass it
      ``log_probs.transpose(0, 1)``.
    """
    # pack_padded_sequence requires the lengths to live on the CPU.
    lengths = torch.as_tensor(lengths).cpu()

    # Packing makes the LSTM skip padded frames; enforce_sorted=False lets
    # batches arrive in any length order.
    packed = pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
    packed_out, _ = self.lstm(packed)
    out, out_lengths = pad_packed_sequence(packed_out, batch_first=True)

    log_probs = self.classifier(out).log_softmax(dim=2)
    return log_probs, out_lengths

## 3.2 Model Initialization

In [None]:
# create model
# Feature dimension is read from the data instead of being hard-coded.
# assumes every utterance is a (frames, features) array — TODO confirm
input_size = train_data[0].shape[-1]
hidden_size = 256  # baseline value; tune as needed
num_layers = 3     # baseline value; tune as needed
# One output class per label id seen in the training labels.
# NOTE(review): assumes label ids are 0-based and that id 0 is the CTC blank
# (nn.CTCLoss uses blank=0 by default) — confirm against phoneme_list.py.
output_size = int(max(int(np.max(y)) for y in train_labels if len(y))) + 1

model = LSTMModel(input_size, hidden_size, num_layers, output_size)
model = model.to(device)
print(model)

# 4 Model Training

## 4.0 Set Hyperparameters

In [None]:
# Hyperparams (baseline values; tune as needed)
learning_rate = 2e-3
weight_decay = 5e-6

# CTCLoss defaults: blank index 0, loss averaged over the batch.
criterion = nn.CTCLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

# You can add a LR scheduler

## 4.1 Train Epoch

In [None]:
# Run one training pass over the loader
def train_epoch(model, train_loader, criterion, optimizer):
  """Train ``model`` for one epoch and return the average batch loss.

  The batch loop is still a TODO: until it is filled in, nothing is
  accumulated and the returned loss is 0.0 for any non-empty loader.

  Args:
    model: module to train; switched to training mode.
    train_loader: sized iterable of batches; its length is the divisor.
    criterion: loss function (unused until the TODO is implemented).
    optimizer: optimizer (unused until the TODO is implemented).

  Returns:
    The accumulated loss divided by ``len(train_loader)``.
  """
  model.train()

  started_at = time.time()
  avg_loss = 0.0

  # TODO: Add logic here

  elapsed = time.time() - started_at
  avg_loss = avg_loss / len(train_loader) # average batch loss

  print(f'Training loss: {avg_loss} Time: {elapsed}')
  return avg_loss

## 4.2 CTC Decoding

In [None]:
import sys
# phoneme_list is imported from data/HW3P2_Data, which is not on the import
# path by default, so the directory is appended to sys.path first.
sys.path.append("data/HW3P2_Data")

from phoneme_list import PHONEME_MAP, PHONEME_LIST

In [None]:
from ctcdecode import CTCBeamDecoder

# TODO: Initialize decoder here
# In CTCBeamDecoder beam_width=1 (greedy search); beam_width>1 (beam search)

## 4.3 Validate Epoch

In [None]:
# Validate the model
def validate_model(model, val_loader, criterion):
  """Evaluate ``model`` on ``val_loader`` without tracking gradients.

  The batch loop is still a TODO: until it is filled in, every returned
  value keeps its initial (zero / empty) state.

  Args:
    model: module to evaluate; switched to evaluation mode.
    val_loader: iterable of validation batches.
    criterion: loss function (unused until the TODO is implemented).

  Returns:
    Tuple ``(avg_loss, predictions, distances, running_dist)``.
  """

  avg_loss = 0.0
  running_dist = 0.0
  predictions = []
  # Was never initialised, so the return statement raised NameError.
  distances = []

  with torch.no_grad():
    # model in validation mode 
    model.eval()

    start = time.time()

    # TODO: Add logic here (remember to decode output and compute distance)

    end = time.time()

    print(f'Validation loss: {avg_loss} Levenshtein distance: {running_dist} Time: {end - start}')
    return avg_loss, predictions, distances, running_dist

## 4.4 Run Epochs

In [None]:
# Define number of epochs
epochs = 30  # baseline value; tune as needed

# Checkpoint of the best model so far, written to the current working directory.
best_model_path = 'best_model.pt'

best_loss = float('inf')

print('Start...')
for epoch in range(epochs):
  print('Epoch: ', epoch+1)

  training_loss = train_epoch(model, train_loader, criterion, optimizer)
  val_loss, predictions, distance, running_dist = validate_model(model, dev_loader, criterion)

  # save the best model
  if val_loss < best_loss:
    print('Best loss: {}, epoch: {}'.format(val_loss, epoch + 1))
    # Only the weights are stored; rebuild the model and call
    # load_state_dict(torch.load(best_model_path)) to restore them.
    torch.save(model.state_dict(), best_model_path)
    best_loss = val_loss

  print('='*40)
print('Done...')

# 5 Test Data

## 5.1 Make Predictions

## 5.2 Save Predictions to csv File

## 5.3 Submit Predictions