Module reclab.data_utils

A utility module for loading and manipulating various datasets.
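
A minimal usage sketch, assuming reclab is importable and the MovieLens 100K download succeeds:

from reclab import data_utils

# Load MovieLens 100K as (users, items, ratings) dicts, then make an 80/20 split.
users, items, ratings = data_utils.read_dataset('ml-100k')
train, test = data_utils.split_ratings(ratings, 0.8, shuffle=True, seed=0)
print(len(users), len(items), len(train), len(test))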

"""A utility module for loading and manipulating various datasets."""
import collections
import os
import urllib.request
import zipfile

import numpy as np
import pandas as pd


DATA_DIR = os.path.join(os.path.dirname(__file__), '../data')


def split_ratings(ratings, proportion, shuffle=False, seed=None):
    """Split a group of ratings into two groups.

    Parameters
    ----------
    ratings : dict
        The ratings to split.
    proportion : float
        The proportion of ratings that will be in the first group. Must be between 0 and 1.
    shuffle : bool
        Whether to shuffle the rating data before splitting.
    seed : int, optional
        The seed for numpy's random number generator when shuffling. If None,
        the current global random state is used.

    Returns
    -------
    ratings_1 : OrderedDict
        The first set of ratings.
    ratings_2 : OrderedDict
        The second set of ratings.

    """
    split_1 = collections.OrderedDict()
    split_2 = collections.OrderedDict()
    split_1_end = int(proportion * len(ratings))
    iterator = list(ratings.items())

    if shuffle:
        if seed is not None:
            np.random.seed(seed)
        np.random.shuffle(iterator)

    for i, (key, val) in enumerate(iterator):
        if i < split_1_end:
            split_1[key] = val
        else:
            split_2[key] = val

    return split_1, split_2


def find_zipped(zipped_dir_name, data_name, data_url, csv_params):
    """Locate or download zipped file and load csv into DataFrame.

    Parameters
    ----------
    zipped_dir_name : str
        The directory within the downloaded zip.
    data_name : str
        The name of the data file to be loaded from the directory.
    data_url : str
        The location of the download.
    csv_params : dict
        The keyword arguments passed to pandas.read_csv when loading the csv.

    Returns
    -------
    data : DataFrame
        Dataset of interest.

    """
    data_dir = os.path.join(DATA_DIR, zipped_dir_name)
    datafile = os.path.join(data_dir, data_name)
    if not os.path.isfile(datafile):
        os.makedirs(DATA_DIR, exist_ok=True)

        download_location = '{}.zip'.format(data_dir)
        urllib.request.urlretrieve(data_url, filename=download_location)
        with zipfile.ZipFile(download_location, 'r') as zip_ref:
            zip_ref.extractall(DATA_DIR)
        os.remove(download_location)
    data = pd.read_csv(datafile, **csv_params)
    return data


def find_npz(dir_name, data_name, data_url, np_params):
    """Locate or download npz file and load into DataFrame.

    Parameters
    ----------
    dir_name : str
        The directory to put the .npz file.
    data_name : str
        The name of the .npz file.
    data_url : str
        The location of the download.
    np_params : dict
        The keyword arguments passed to the pandas.DataFrame constructor.

    Returns
    -------
    data : DataFrame
        Dataset of interest.

    """
    download_dir = os.path.join(DATA_DIR, dir_name)
    datafile = os.path.join(download_dir, data_name)
    if not os.path.isfile(datafile):
        os.makedirs(download_dir, exist_ok=True)
        urllib.request.urlretrieve(data_url, filename=datafile)
    data_np = np.load(datafile, allow_pickle=True)['train_data']
    data = pd.DataFrame(data_np, **np_params)
    # TODO: deal better with implicit ratings
    data['rating'] = 1
    return data


def get_data(name):
    """Read a dataset specified by name into pandas dataframe.

    Parameters
    ----------
    name : str
        The name of the dataset. Must be one of: 'ml-100k', 'ml-10m', 'ml-1m',
        'citeulike-a', 'pinterest', or 'lastfm'.

    Returns
    -------
    data : DataFrame
        Dataset of interest.

    """
    if name == 'ml-100k':
        zipped_dir_name = 'ml-100k'
        data_name = 'u.data'
        data_url = 'http://files.grouplens.org/datasets/movielens/ml-100k.zip'
        csv_params = dict(sep='\t', header=None, usecols=[0, 1, 2, 3],
                          names=['user_id', 'item_id', 'rating', 'timestamp'])
        data = find_zipped(zipped_dir_name, data_name, data_url, csv_params)
    elif name == 'ml-10m':
        zipped_dir_name = 'ml-10M100K'
        data_name = 'ratings.dat'
        data_url = 'http://files.grouplens.org/datasets/movielens/ml-10m.zip'
        csv_params = dict(sep='::', header=None, usecols=[0, 1, 2, 3],
                          names=['user_id', 'item_id', 'rating', 'timestamp'], engine='python')
        data = find_zipped(zipped_dir_name, data_name, data_url, csv_params)
    elif name == 'ml-1m':
        zipped_dir_name = 'ml-1m'
        data_name = 'ratings.dat'
        data_url = 'http://files.grouplens.org/datasets/movielens/ml-1m.zip'
        csv_params = dict(sep='::', header=None, usecols=[0, 1, 2, 3],
                          names=['user_id', 'item_id', 'rating', 'timestamp'], engine='python')
        data = find_zipped(zipped_dir_name, data_name, data_url, csv_params)
    elif name == 'citeulike-a':
        dir_name = 'citeulike-a'
        data_name = 'data.npz'
        data_url = ('https://raw.githubusercontent.com/tebesu/CollaborativeMemoryNetwork/'
                    'master/data/citeulike-a.npz')
        np_params = dict(columns=['user_id', 'item_id'])
        data = find_npz(dir_name, data_name, data_url, np_params)
    elif name == 'pinterest':
        dir_name = 'pinterest'
        data_name = 'data.npz'
        data_url = ('https://raw.githubusercontent.com/tebesu/CollaborativeMemoryNetwork/'
                    'master/data/pinterest.npz')
        np_params = dict(columns=['user_id', 'item_id'])
        data = find_npz(dir_name, data_name, data_url, np_params)
    elif name == 'lastfm':
        data_name = 'lastfm-dataset-1K/lfm1k-play-counts.csv'
        csv_params = dict(header=0, usecols=[0, 1, 2],
                          names=['user_id', 'item_id', 'rating'])
        datafile = os.path.join(DATA_DIR, data_name)
        try:
            data = pd.read_csv(datafile, **csv_params)
            # log transform for better scaling
            data['rating'] = np.log(1 + data['rating'])
            # TODO: remove artists with less than 50 total listens?
            # otherwise should probably retrain for hyperparameter tuning...
        except FileNotFoundError as error:
            print(('LastFM data must be downloaded and preprocessed locally; '
                   'get the files from https://drive.google.com/open?id=1qxmsQHe'
                   'D8O-81CbHxvaFP8omMvMxgEh0'))
            raise error
    else:
        raise ValueError('dataset name not recognized: {}'.format(name))
    return data


def read_dataset(name, shuffle=True):
    """Read a dataset as specified by name.

    Parameters
    ----------
    name : str
        The name of the dataset. Must be one of: 'ml-100k', 'ml-1m', 'ml-10m',
        'citeulike-a', 'pinterest', or 'lastfm'.
    shuffle : bool, optional
        A flag to indicate whether the dataset should be shuffled after loading,
        true by default.

    Returns
    -------
    users : dict
        The dict of all users where the key is the user-id and the value is the user's features.
    items : dict
        The dict of all items where the key is the item-id and the value is the item's features.
    ratings : dict
        The dict of all ratings where the key is a tuple whose first element is the user-id
        and whose second element is the item id. The value is a tuple whose first element is the
        rating value and whose second element is the rating context (in this case an empty array).

    """
    data = get_data(name)

    if shuffle:
        data = data.sample(frac=1).reset_index(drop=True)

    users = {user_id: np.zeros(0) for user_id in np.unique(data['user_id'])}
    items = {item_id: np.zeros(0) for item_id in np.unique(data['item_id'])}

    # Fill the ratings dict with the initial data.
    ratings = {}
    for user_id, item_id, rating in zip(data['user_id'], data['item_id'], data['rating']):
        # TODO: may eventually want to add a rating context depending on the dataset (e.g. time)
        ratings[user_id, item_id] = (rating, np.zeros(0))

    return users, items, ratings


def get_time_split_dataset(name, shuffle=True, binarize=False):
    """Get a time-based test/train split of a dataset as specified by name.

    Parameters
    ----------
    name : str
        The name of the dataset. Must be one of: 'ml-100k', 'ml-1m', or 'ml-10m'.
        The time-based split requires a 'timestamp' column, which only the
        MovieLens datasets provide.
    shuffle : bool, optional
        A flag to indicate whether the dataset should be shuffled after loading,
        true by default.
    binarize : bool, optional
        A flag to indicate whether to binarize the ratings to be 0 or 1,
        false by default.

    Returns
    -------
    users : dict
        The dict of all users where the key is the user-id and the value is the user's features.
    items : dict
        The dict of all items where the key is the item-id and the value is the item's features.
    train_ratings : dict
        The dict of all training ratings.
    test_ratings : dict
        The dict of all testing ratings.

    """
    data = get_data(name)
    if binarize:
        data['rating'] = 1

    users = {user_id: np.zeros(0) for user_id in np.unique(data['user_id'])}
    items = {item_id: np.zeros(0) for item_id in np.unique(data['item_id'])}

    # Hold out each user's chronologically final rating for the test set.
    test_idx = data.groupby('user_id')['timestamp'].idxmax()
    data_test = data.loc[test_idx]
    data_train = data.drop(test_idx)

    # Shuffle remaining data
    if shuffle:
        data_train = data_train.sample(frac=1).reset_index(drop=True)

    # Fill the train ratings dict with the initial data.
    train_ratings = {}
    for user_id, item_id, rating in zip(data_train['user_id'], data_train['item_id'],
                                        data_train['rating']):
        # TODO: may eventually want to add a rating context depending on the dataset (e.g. time)
        train_ratings[user_id, item_id] = (rating, np.zeros(0))

    # Fill the test ratings dict with the initial data.
    test_ratings = {}
    for user_id, item_id, rating in zip(data_test['user_id'], data_test['item_id'],
                                        data_test['rating']):
        # TODO: may eventually want to add a rating context depending on the dataset (e.g. time)
        test_ratings[user_id, item_id] = (rating, np.zeros(0))

    return users, items, train_ratings, test_ratings

Functions

def find_npz(dir_name, data_name, data_url, np_params)

Locate or download an npz file and load it into a DataFrame.

Parameters

dir_name : str
The directory to put the .npz file.
data_name : str
The name of the .npz file.
data_url : str
The location of the download.
np_params : dict
The keyword arguments passed to the pandas.DataFrame constructor.

Returns

data : DataFrame
Dataset of interest.
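
For illustration, a minimal sketch mirroring how get_data calls this function for the citeulike-a dataset; the URL is the one used in get_data, and the download must succeed:

from reclab.data_utils import find_npz

data_url = ('https://raw.githubusercontent.com/tebesu/CollaborativeMemoryNetwork/'
            'master/data/citeulike-a.npz')
# The first call downloads data.npz into DATA_DIR/citeulike-a; later calls reuse it.
data = find_npz('citeulike-a', 'data.npz', data_url,
                dict(columns=['user_id', 'item_id']))
print(data.head())  # implicit feedback, so every rating is 1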
def find_zipped(zipped_dir_name, data_name, data_url, csv_params)

Locate or download a zipped data file and load its csv into a DataFrame.

Parameters

zipped_dir_name : str
The directory within the downloaded zip.
data_name : str
The name of the data file to be loaded from the directory.
data_url : str
The location of the download.
csv_params : dict
The keyword arguments passed to pandas.read_csv when loading the csv.

Returns

data : DataFrame
Dataset of interest.
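
As a sketch, the ml-100k configuration taken from get_data (assuming the GroupLens server is reachable):

from reclab.data_utils import find_zipped

data_url = 'http://files.grouplens.org/datasets/movielens/ml-100k.zip'
csv_params = dict(sep='\t', header=None, usecols=[0, 1, 2, 3],
                  names=['user_id', 'item_id', 'rating', 'timestamp'])
# Downloads and unzips into DATA_DIR on the first call; afterwards the cached
# u.data file is read directly.
data = find_zipped('ml-100k', 'u.data', data_url, csv_params)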
def get_data(name)

Read the dataset specified by name into a pandas DataFrame.

Parameters

name : str
The name of the dataset. Must be one of: 'ml-100k', 'ml-10m', 'ml-1m', 'citeulike-a', 'pinterest', or 'lastfm'.

Returns

data : DataFrame
Dataset of interest.
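
A short usage sketch; the column names follow from the csv_params shown in the source above:

from reclab.data_utils import get_data

data = get_data('ml-100k')
print(data.columns.tolist())  # ['user_id', 'item_id', 'rating', 'timestamp']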
def get_time_split_dataset(name, shuffle=True, binarize=False)

Get a time-based test/train split of a dataset as specified by name.

Parameters

name : str
The name of the dataset. Must be one of: 'ml-100k', 'ml-1m', or 'ml-10m'. The time-based split requires a 'timestamp' column, which only the MovieLens datasets provide.
shuffle : bool, optional
A flag to indicate whether the dataset should be shuffled after loading, true by default.
binarize : bool, optional
A flag to indicate whether to binarize the ratings to be 0 or 1, false by default.

Returns

users : dict
The dict of all users where the key is the user-id and the value is the user's features.
items : dict
The dict of all items where the key is the item-id and the value is the item's features.
train_ratings : dict
The dict of all training ratings.
test_ratings : dict
The dict of all testing ratings.
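
A minimal sketch, assuming the ml-100k download succeeds. Because one rating per user is held out, the test set should contain exactly one entry per user:

from reclab.data_utils import get_time_split_dataset

users, items, train_ratings, test_ratings = get_time_split_dataset('ml-100k')
# Each user's chronologically latest rating was held out for the test set.
assert len(test_ratings) == len(users)
print(len(train_ratings), len(test_ratings))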
def read_dataset(name, shuffle=True)

Read a dataset as specified by name.

Parameters

name : str
The name of the dataset. Must be one of: 'ml-100k', 'ml-1m', 'ml-10m', 'citeulike-a', 'pinterest', or 'lastfm'.
shuffle : bool, optional
A flag to indicate whether the dataset should be shuffled after loading, true by default.

Returns

users : dict
The dict of all users where the key is the user-id and the value is the user's features.
items : dict
The dict of all items where the key is the item-id and the value is the item's features.
ratings : dict
The dict of all ratings where the key is a tuple whose first element is the user-id and whose second element is the item id. The value is a tuple whose first element is the rating value and whose second element is the rating context (in this case an empty array).
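
A minimal sketch of the returned structure, assuming the download succeeds:

from reclab.data_utils import read_dataset

users, items, ratings = read_dataset('ml-100k', shuffle=False)
# Inspect an arbitrary observed (user_id, item_id) pair.
(user_id, item_id), (value, context) = next(iter(ratings.items()))
print(user_id, item_id, value, context.shape)  # context is an empty array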
def split_ratings(ratings, proportion, shuffle=False, seed=None)

Split a group of ratings into two groups.

Parameters

ratings : dict
The ratings to split.
proportion : float
The proportion of ratings that will be in the first group. Must be between 0 and 1.
shuffle : bool
Whether to shuffle the rating data before splitting.
seed : int, optional
The seed for numpy's random number generator when shuffling. If None, the current global random state is used.

Returns

ratings_1 : OrderedDict
The first set of ratings.
ratings_2 : OrderedDict
The second set of ratings.
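
A self-contained sketch on a toy ratings dict, using the same (rating, context) value convention as read_dataset:

import numpy as np

from reclab.data_utils import split_ratings

ratings = {(user, item): (1.0, np.zeros(0))
           for user, item in zip(range(10), range(10))}
train, test = split_ratings(ratings, 0.8, shuffle=True, seed=42)
print(len(train), len(test))  # 8 2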