AWS S3

This section shows how to connect Amazon Web Services (AWS) S3 as a data source on the Platform.

Python

# Platform Kernels: Python 2,3
# Libraries: boto3==1.4.4

import boto3
import os


def s3_ls(bucket_name, path, creds):
    """List contents of an S3 bucket specified in prefix 'path'

    Parameters
    ----------

    bucket_name: string
        Name of the bucket of interest

    path: string
        Prefix of interest. Do not include a "/" to start.
        e.g. for s3://a-bucket/folderA/ => path = "folderA/"

    creds: dict
        Contains your AWS S3 credentials. The dictionary should have
        two keys: "AWS_ACCESS_KEY_ID" and "AWS_SECRET_ACCESS_KEY". Their values
        should be stored as environment variables (os.environ[]) on the platform.

    Returns
    -------

    A list of the files

    """
    s3_client = boto3.client(service_name='s3',
                             aws_access_key_id=creds["AWS_ACCESS_KEY_ID"],
                             aws_secret_access_key=creds["AWS_SECRET_ACCESS_KEY"])

    if path != '' and path[-1] != '/':
        path += '/'

    files = []
    directories = []

    try:
        for obj in s3_client.list_objects(Bucket=bucket_name, Prefix=path)['Contents']:
            key = obj['Key'].replace(path, '')

            if '/' not in key:
                files.append(key)
            elif key.split('/')[0] + '/' not in directories:
                directories.append(key.split('/')[0] + '/')

    except KeyError:
        return 'Directory Not Found'

    return directories + files
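

# Note: boto3's list_objects returns at most 1,000 keys per response, so
# s3_ls silently truncates larger prefixes. The helper below is a minimal
# sketch (s3_ls_all_keys is a hypothetical name, not part of the platform
# API) that uses a boto3 paginator to walk past that limit.
def s3_ls_all_keys(bucket_name, path, creds):
    """List every key under prefix 'path', paginating past the 1,000-key limit"""
    s3_client = boto3.client(service_name='s3',
                             aws_access_key_id=creds["AWS_ACCESS_KEY_ID"],
                             aws_secret_access_key=creds["AWS_SECRET_ACCESS_KEY"])

    keys = []
    paginator = s3_client.get_paginator('list_objects')
    for page in paginator.paginate(Bucket=bucket_name, Prefix=path):
        # Pages with no matching objects omit the 'Contents' entry
        for obj in page.get('Contents', []):
            keys.append(obj['Key'])
    return keys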


def s3_pull_file(bucket_name, filepath, local_dir, creds):
    """Pull a file (any format) from S3 into the platform environment

    After the file has been pulled in,
    it can be read into Python using the usual methods (e.g. open())

    Parameters
    ----------

    bucket_name: string
        Name of the bucket of interest

    filepath: string
        File prefix of interest. Do not include a "/" to start.
        e.g. for s3://a-bucket/folderA/file.dat => filepath = "folderA/file.dat"

    local_dir: string
        Local path, including the filename, where you want to store the file.
        e.g. local_dir = 'tmp_storage/file_tmp.dat'

    creds: dict
        Contains your AWS S3 credentials. The dictionary should have
        two keys: "AWS_ACCESS_KEY_ID" and "AWS_SECRET_ACCESS_KEY". Their values
        should be stored as environment variables (os.environ[]) on the platform.

    """

    s3_client = boto3.client(service_name='s3',
                             aws_access_key_id=creds["AWS_ACCESS_KEY_ID"],
                             aws_secret_access_key=creds["AWS_SECRET_ACCESS_KEY"])
    
    # Create the destination directory if it does not already exist
    local_dirname = os.path.dirname(local_dir)
    if local_dirname and not os.path.exists(local_dirname):
        os.makedirs(local_dirname)
        
    # Download the file
    s3_client.download_file(Bucket=bucket_name, Key=filepath, Filename=local_dir)
    
    print("Your file is now available at {}".format(local_dir))


def s3_push_file(bucket_name, local_filepath, s3_filepath, creds):
    """Push a file from the platform environment into S3

    Parameters
    ----------

    bucket_name: string
        Name of the bucket of interest

    local_filepath: string
        Local filepath of the file of interest (e.g. "/home/jupyter/data/filea.dat")

    s3_filepath: string
        prefix of the file to be stored on s3 (e.g. "docs/filea.dat")

    creds: dict
        Contains your AWS S3 credentials. The dictionary should have
        two keys: "AWS_ACCESS_KEY_ID" and "AWS_SECRET_ACCESS_KEY". Their values
        should be stored as environment variables (os.environ[]) on the platform.

    """
    s3_client = boto3.client(service_name='s3',
                             aws_access_key_id=creds["AWS_ACCESS_KEY_ID"],
                             aws_secret_access_key=creds["AWS_SECRET_ACCESS_KEY"])

    try:
        s3_client.upload_file(local_filepath, bucket_name, s3_filepath)
        print("Uploaded to " + "s3://" + bucket_name + "/" + s3_filepath)
    except Exception as e:
        print("Upload error for " + local_filepath)
        print(str(e))
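

# The reverse of the sketch above: put_object uploads in-memory bytes without
# an intermediate local file. Another minimal sketch (s3_write_bytes is a
# hypothetical name, not part of the platform API):
def s3_write_bytes(bucket_name, s3_filepath, data, creds):
    """Upload a bytes (or str) payload directly to s3://<bucket_name>/<s3_filepath>"""
    s3_client = boto3.client(service_name='s3',
                             aws_access_key_id=creds["AWS_ACCESS_KEY_ID"],
                             aws_secret_access_key=creds["AWS_SECRET_ACCESS_KEY"])
    s3_client.put_object(Bucket=bucket_name, Key=s3_filepath, Body=data)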

Usage Example

# Platform Kernels: Python 2,3
# Libraries: boto3==1.4.4

import boto3
import os
from s3 import s3_ls, s3_pull_file, s3_push_file

# Usage example:
creds = {"AWS_ACCESS_KEY_ID": os.environ["AWS_ACCESS_KEY_ID"],
         "AWS_SECRET_ACCESS_KEY": os.environ["AWS_SECRET_ACCESS_KEY"]}
bucket_name = os.environ["S3_BUCKETNAME"]
home_path = os.path.expanduser('~')

# List the content of bucket
list_of_files = s3_ls(bucket_name, "prefix1/", creds)

# Pull a file from S3:
s3_pull_file(bucket_name, 'prefix1/README', "{0}/tmp/README".format(home_path), creds)

# Push a file to S3:
s3_push_file(bucket_name, "{0}/tmp/README".format(home_path), "prefix2/README", creds)

R

# Platform Kernels: R3
# Libraries: aws.s3==0.3.3, utils==3.3.2

library('aws.s3')
library('utils')


set_S3_keys <- function(YOUR_ACCESS_KEY, YOUR_SECRET_KEY) {
  #' Sets the credentials as environment variables.
  #'
  #' ALL aws.s3 functions will look for your keys as environment variables
  #' by default
  #'
  #' Parameters
  #' ----------
  #'
  #' YOUR_ACCESS_KEY: string
  #'     AWS S3 access key
  #'
  #' YOUR_SECRET_KEY: string
  #'     AWS S3 secret key

  Sys.setenv('AWS_ACCESS_KEY_ID' = YOUR_ACCESS_KEY,
             'AWS_SECRET_ACCESS_KEY' = YOUR_SECRET_KEY)
}

s3_ls <- function(bucket_name, path){
  #' List the contents of an S3 bucket under the prefix 'path'
  #'
  #' Parameters
  #' ----------
  #'
  #' bucket_name: string
  #'     Name of the bucket of interest
  #'
  #' path: string
  #'     Prefix of interest. Do not include a "/" to start.
  #'     e.g. for s3://a-bucket/folderA/ => path = "folderA/"
  #'
  #' Returns
  #' -------
  #'
  #' A list of the files

  return(get_bucket(bucket_name, prefix = path))
}
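

# Note: get_bucket parses a limited number of keys per request (1,000 by
# default). aws.s3 documents a 'max' argument that loops over requests, so
# max = Inf should return every key under the prefix. A minimal sketch
# (s3_ls_all is a hypothetical name; verify 'max' against your installed
# aws.s3 version):
s3_ls_all <- function(bucket_name, path){
  return(get_bucket(bucket_name, prefix = path, max = Inf))
}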

s3_import_data <- function(bucket_name, filepath, read_func=NULL){
  #' Pull and import a file from S3 into memory
  #'
  #'
  #' Parameters
  #' ----------
  #'
  #' bucket_name: string
  #'     Name of the bucket of interest
  #'
  #' filepath: string
  #'     File prefix of interest. Do not include a "/" to start.
  #'     e.g. for s3://a-bucket/folderA/file.dat => filepath = "folderA/file.dat"
  #'
  #' read_func: function
  #'     Function used to import the data; choose it to match the file type.
  #'     Example readers:
  #'        - s3load an RData file (default, when read_func is NULL)
  #'        - read.csv (csv)
  #'        - read.table (text)
  #'        - read.xls (excel)
  #'        - read.mtp (minitab)
  #'        - read.spss (spss)
  #'

  if (is.null(read_func)){

    # Load the RData object from S3 into the current environment
    return(s3load(object = filepath, bucket = bucket_name))
  } else {

    # Use read_func to read the object from S3
    return(s3read_using(FUN = read_func,
                        object = filepath,
                        bucket = bucket_name))
  }
}


s3_export_data <- function(data, bucket_name, filepath, write_func=NULL){
  #' Push an object in memory to S3
  #'
  #'
  #' Parameters
  #' ----------
  #'
  #' data: object
  #'     Variable containing data to push to S3
  #'
  #' bucket_name: string
  #'     Name of the bucket of interest
  #'
  #' filepath: string
  #'     File prefix of interest. Do not include a "/" to start.
  #'     e.g. for s3://a-bucket/folderA/file.dat => filepath = "folderA/file.dat"
  #'
  #' write_func: function
  #'     Function used to export the data; choose it to match the file type.
  #'     Example writers:
  #'        - s3save an RData file (default, when write_func is NULL)
  #'        - write.csv (csv)
  #'        - write.table (text)

  if (is.null(write_func)){

    # Save an RData object to s3
    s3save(data, bucket = bucket_name, object = filepath)
  } else {

    # Use write_func to save to s3
    s3write_using(data,
                  FUN = write_func,
                  object = filepath,
                  bucket = bucket_name)
  }
}
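

# For single R objects, aws.s3 also ships s3saveRDS()/s3readRDS(), which
# serialize one object per file and return it directly on read (unlike
# s3load, which restores objects into the environment). A minimal sketch,
# assuming these helpers exist in your installed aws.s3 version:
#
#   s3saveRDS(data, object = 'prefix2/data.rds', bucket = bucket_name)
#   data2 <- s3readRDS(object = 'prefix2/data.rds', bucket = bucket_name)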

Usage Example

main <- function() {

  # The name of the bucket of interest:
  bucket_name <- Sys.getenv("S3_BUCKETNAME")

  set_S3_keys(YOUR_ACCESS_KEY = Sys.getenv("AWS_ACCESS_KEY_ID"),
              YOUR_SECRET_KEY = Sys.getenv("AWS_SECRET_ACCESS_KEY"))

  # List the contents of a bucket with a specified prefix:
  s3_ls(bucket_name, path = 'prefix1/')

  # Importing

  # Pull a text file from S3:
  s3_import_data(bucket_name, 'prefix1/data.txt', read_func=read.table)

  # Pull a csv file from S3:
  s3_import_data(bucket_name, 'prefix1/data.csv', read_func=read.csv)

  # Pull an RData file from S3:
  s3_import_data(bucket_name, 'prefix1/data.RData')

  # Exporting

  data <- "README"

  # Push a text file to S3:
  s3_export_data(data, bucket_name, 'prefix2/data.txt', write_func=write.table)

  # Push a csv file to S3:
  s3_export_data(data, bucket_name, 'prefix2/data.csv', write_func=write.csv)

  # Push an RData file to S3:
  s3_export_data(data, bucket_name, 'prefix2/data.RData')
}

main()