AWS Redshift

This section shows how to connect Amazon Web Services (AWS) Redshift as a data source on the Platform.

Python

# Platform Kernels: Python 2,3
# Libraries: psycopg2==2.7.1, pandas==0.20.3

# We adopt the psycopg2 client library to connect to
# postgresdb like redshift:
import psycopg2
import os
import pandas as pd


def RS_postgres_query(query_str, creds):
    """A sample query to validate the working of the db connection.

    Parameters
    ----------

    query_str: string
        A string which contains the query you want to run.

    creds: dict
        Contains your RS/Postgres credentials. The dictionary should have
        five keys : "DATABASE_NAME", "REDSHIFT_PG_PORT", "REDSHIFT_PG_PASSWORD",
        "REDSHIFT_PG_USER", "REDSHIFT_HOST". Those should be stored
        as environment variables (os.envion[]) on the platform.


    Returns
    -------

    A pandas dataframe with the results of the query.

    """
    conn = psycopg2.connect(**creds)
    return pd.read_sql(query_str, conn)

Usage Example

# Platform Kernels: Python 2,3
# Libraries: psycopg2==2.7.1, pandas==0.20.3

import psycopg2
import os
import pandas as pd
from redshift_postgres import RS_postgres_query

# Put the access credentials to the database in your environment variables
# into a dictionary.
creds = dict(database=os.environ['DATABASE_NAME'],
             port=os.environ['REDSHIFT_PG_PORT'],
             password=os.environ['REDSHIFT_PG_PASSWORD'],
             user=os.environ['REDSHIFT_PG_USER'],
             host=os.environ['REDSHIFT_PG_HOST'])

results_df = RS_postgres_query("""SELECT * FROM my_table""", creds)

R

# THIS IS NOT WORKING IN JUPYTER NOTEBOOKS

# options(repos=structure(c(CRAN="https://cran.cnr.berkeley.edu/")))

#install.packages("RPostgreSQL")
require("RPostgreSQL")


query_redshift <- function(conn, query){
  #' Queries redshift database with specified query
  #'
  #' Parameters
  #' ----------
  #'
  #' conn: connection object
  #'
  #' query: string
  #'     SQL query to run

  df_postgres <- dbGetQuery(conn, query)
  return(df_postgres)

}

Usage Example

main <- function(){

# Make sure these are installed before running the code snippet below

## Install RJava
#!sudo apt-get update
#!sudo apt-get -y install default-jre
#!sudo apt-get -y install default-jdk
#!R CMD javareconf
#!sudo apt-get -y install r-cran-rjava

## Install driver for Postgres
#!sudo apt-get -y install libpq-dev

#install.packages("RPostgreSQL")
require("RPostgreSQL")

# loads the PostgreSQL driver
drv <- dbDriver("PostgreSQL")

# creates a connection to the postgres / redshift database
conn <- dbConnect(drv,
                  dbname = Sys.getenv("RS_DB"),
                  host = Sys.getenv("RS_HOST"),
                  port = Sys.getenv("RS_PORT"),
                  user = Sys.getenv("RS_USER"),
                  password = Sys.getenv("RS_PASSWORD"))

# define SQL query
query = "SELECT * FROM pg_catalog.pg_tables;"

# query redshift database
df_postgres = query_redshift(conn, query)
print(df_postgres)

}