Stock Price Prediction using Linear Regression

The notebook linear_regression.ipynb contains examples for the prediction of stock prices using OLS with statsmodels and sklearn, as well as ridge and lasso models.

It is designed to run as a notebook on the Quantopian research platform.

How to run this notebook

This notebook is written for the Quantopian research environment.

Imports

In [3]:
import pandas as pd
import numpy as np
from time import time
import talib
import re
from statsmodels.api import OLS
from sklearn.metrics import mean_squared_error
from scipy.stats import spearmanr, pearsonr
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV, LogisticRegression
from sklearn.preprocessing import StandardScaler

from quantopian.research import run_pipeline
from quantopian.pipeline import Pipeline, factors, filters, classifiers
from quantopian.pipeline.data.builtin import USEquityPricing

from quantopian.pipeline.factors import (Latest, 
                                         Returns, 
                                         AverageDollarVolume, 
                                         SimpleMovingAverage,
                                         EWMA,
                                         BollingerBands,
                                         CustomFactor,
                                         MarketCap,
                                        SimpleBeta)

from quantopian.pipeline.filters import QTradableStocksUS, StaticAssets
from quantopian.pipeline.data.quandl import fred_usdontd156n as libor
from empyrical import max_drawdown, sortino_ratio

import seaborn as sns
import matplotlib.pyplot as plt

Data Sources

In [4]:
################
# Fundamentals #
################

# Morningstar fundamentals (2002 - Ongoing)
# https://www.quantopian.com/help/fundamentals
from quantopian.pipeline.data import Fundamentals

#####################
# Analyst Estimates #
#####################

# Earnings Surprises - Zacks (27 May 2006 - Ongoing)
# https://www.quantopian.com/data/zacks/earnings_surprises
from quantopian.pipeline.data.zacks import EarningsSurprises
from quantopian.pipeline.factors.zacks import BusinessDaysSinceEarningsSurprisesAnnouncement

##########
# Events #
##########

# Buyback Announcements - EventVestor (01 Jun 2007 - Ongoing)
# https://www.quantopian.com/data/eventvestor/buyback_auth
from quantopian.pipeline.data.eventvestor import BuybackAuthorizations
from quantopian.pipeline.factors.eventvestor import BusinessDaysSinceBuybackAuth

# CEO Changes - EventVestor (01 Jan 2007 - Ongoing)
# https://www.quantopian.com/data/eventvestor/ceo_change
from quantopian.pipeline.data.eventvestor import CEOChangeAnnouncements

# Dividends - EventVestor (01 Jan 2007 - Ongoing)
# https://www.quantopian.com/data/eventvestor/dividends
from quantopian.pipeline.data.eventvestor import (
    DividendsByExDate,
    DividendsByPayDate,
    DividendsByAnnouncementDate,
)
from quantopian.pipeline.factors.eventvestor import (
    BusinessDaysSincePreviousExDate,
    BusinessDaysUntilNextExDate,
    BusinessDaysSinceDividendAnnouncement,
)

# Earnings Calendar - EventVestor (01 Jan 2007 - Ongoing)
# https://www.quantopian.com/data/eventvestor/earnings_calendar
from quantopian.pipeline.data.eventvestor import EarningsCalendar
from quantopian.pipeline.factors.eventvestor import (
    BusinessDaysUntilNextEarnings,
    BusinessDaysSincePreviousEarnings
)

# 13D Filings - EventVestor (01 Jan 2007 - Ongoing)
# https://www.quantopian.com/data/eventvestor/_13d_filings
from quantopian.pipeline.data.eventvestor import _13DFilings
from quantopian.pipeline.factors.eventvestor import BusinessDaysSince13DFilingsDate

#############
# Sentiment #
#############

# News Sentiment - Sentdex Sentiment Analysis (15 Oct 2012 - Ongoing)
# https://www.quantopian.com/data/sentdex/sentiment
from quantopian.pipeline.data.sentdex import sentiment

Prepare the Data

We need to select a universe of equities and a time horizon, build and transform alpha factors that we will use as features, calculate forward returns that we aim to predict, and potentially clean our data.

Time horizon

In [5]:
# trading days per period
MONTH = 21
YEAR = 12 * MONTH
In [7]:
START = '2017-01-01'
END = '2018-12-31'

Universe

We will use equity data for the years 2014 and 2015 from a custom Q50US universe that uses built-in filters, factors, and classifiers to select the 50 stocks with the highest average dollar volume of the last 200 trading days filtered by additional default criteria (see Quantopian docs linked on GitHub for detail). The universe dynamically updates based on the filter criteria so that, while there are 100 stocks at any given point, there may be more than 50 distinct equities in the sample:

In [8]:
def Q50US():
    return filters.make_us_equity_universe(
        target_size=50,
        rankby=factors.AverageDollarVolume(window_length=200),
        mask=filters.default_us_equity_universe_mask(),
        groupby=classifiers.fundamentals.Sector(),
        max_group_weight=0.3,
        smoothing_func=lambda f: f.downsample('month_start'),
    )
In [9]:
# UNIVERSE = StaticAssets(symbols(['MSFT', 'AAPL']))
UNIVERSE = Q50US()

Factor Transformations

In [10]:
class AnnualizedData(CustomFactor):
    # Get the sum of the last 4 reported values
    window_length = 260

    def compute(self, today, assets, out, asof_date, values):
        for asset in range(len(assets)):
            # unique asof dates indicate availability of new figures
            _, filing_dates = np.unique(asof_date[:, asset], return_index=True)
            quarterly_values = values[filing_dates[-4:], asset]
            # ignore annual windows with <4 quarterly data points
            if len(~np.isnan(quarterly_values)) != 4:
                out[asset] = np.nan
            else:
                out[asset] = np.sum(quarterly_values)
In [11]:
class AnnualAvg(CustomFactor):
    window_length = 252
    
    def compute(self, today, assets, out, values):
        out[:] = (values[0] + values[-1])/2
In [12]:
def run_pipeline_chunks(pipe, start_date, end_date, chunks_len = None):
    
    chunks  = []
    current = pd.Timestamp(start_date)
    end     = pd.Timestamp(end_date)
    step    = pd.Timedelta(weeks=26) if chunks_len is None else chunks_len
    
    start_pipeline_timer = time()
    
    while current <= end:
        
        current_end = current + step
        if current_end > end:
            current_end = end
        
        start_timer = time()
        print 'Running pipeline:', current, ' - ', current_end
        results = run_pipeline(pipe, current.strftime("%Y-%m-%d"), current_end.strftime("%Y-%m-%d"))
        chunks.append(results)
        
        # pipeline returns more days than requested (if no trading day), so get last date from the results
        current_end = results.index.get_level_values(0)[-1].tz_localize(None)
        current = current_end + pd.Timedelta(days=1)
        
        end_timer = time()
        print "Time to run this chunk of the pipeline %.2f secs" % (end_timer - start_timer)
        
    end_pipeline_timer = time()
    print "Time to run the entire pipeline %.2f secs" % (end_pipeline_timer - start_pipeline_timer)
    return pd.concat(chunks)
In [13]:
def factor_pipeline(factors):
    start = time()
    pipe = Pipeline({k: v(mask=UNIVERSE).rank() for k, v in factors.items()},
                    screen=UNIVERSE)
    result = run_pipeline_chunks(pipe, start_date=START, end_date=END)
    return result, time() - start

Factor Library

Value Factors

In [14]:
class ValueFactors:
    """Definitions of factors for cross-sectional trading algorithms"""
    
    @staticmethod
    def PriceToSalesTTM(**kwargs):
        """Last closing price divided by sales per share"""        
        return Fundamentals.ps_ratio.latest

    @staticmethod
    def PriceToEarningsTTM(**kwargs):
        """Closing price divided by earnings per share (EPS)"""
        return Fundamentals.pe_ratio.latest
 
    @staticmethod
    def PriceToDilutedEarningsTTM(mask):
        """Closing price divided by diluted EPS"""
        last_close = USEquityPricing.close.latest
        diluted_eps = AnnualizedData(inputs = [Fundamentals.diluted_eps_earnings_reports_asof_date,
                                               Fundamentals.diluted_eps_earnings_reports],
                                     mask=mask)
        return last_close / diluted_eps

    @staticmethod
    def PriceToForwardEarnings(**kwargs):
        """Price to Forward Earnings"""
        return Fundamentals.forward_pe_ratio.latest
    
    @staticmethod
    def DividendYield(**kwargs):
        """Dividends per share divided by closing price"""
        return Fundamentals.trailing_dividend_yield.latest

    @staticmethod
    def PriceToFCF(mask):
        """Price to Free Cash Flow"""
        last_close = USEquityPricing.close.latest
        fcf_share = AnnualizedData(inputs = [Fundamentals.fcf_per_share_asof_date,
                                             Fundamentals.fcf_per_share],
                                   mask=mask)
        return last_close / fcf_share

    @staticmethod
    def PriceToOperatingCashflow(mask):
        """Last Close divided by Operating Cash Flows"""
        last_close = USEquityPricing.close.latest
        cfo_per_share = AnnualizedData(inputs = [Fundamentals.cfo_per_share_asof_date,
                                                 Fundamentals.cfo_per_share],
                                       mask=mask)        
        return last_close / cfo_per_share

    @staticmethod
    def PriceToBook(mask):
        """Closing price divided by book value"""
        last_close = USEquityPricing.close.latest
        book_value_per_share = AnnualizedData(inputs = [Fundamentals.book_value_per_share_asof_date,
                                              Fundamentals.book_value_per_share],
                                             mask=mask)        
        return last_close / book_value_per_share


    @staticmethod
    def EVToFCF(mask):
        """Enterprise Value divided by Free Cash Flows"""
        fcf = AnnualizedData(inputs = [Fundamentals.free_cash_flow_asof_date,
                                       Fundamentals.free_cash_flow],
                             mask=mask)
        return Fundamentals.enterprise_value.latest / fcf

    @staticmethod
    def EVToEBITDA(mask):
        """Enterprise Value to Earnings Before Interest, Taxes, Deprecation and Amortization (EBITDA)"""
        ebitda = AnnualizedData(inputs = [Fundamentals.ebitda_asof_date,
                                          Fundamentals.ebitda],
                                mask=mask)

        return Fundamentals.enterprise_value.latest / ebitda

    @staticmethod
    def EBITDAYield(mask):
        """EBITDA divided by latest close"""
        ebitda = AnnualizedData(inputs = [Fundamentals.ebitda_asof_date,
                                          Fundamentals.ebitda],
                                mask=mask)
        return USEquityPricing.close.latest / ebitda
In [15]:
VALUE_FACTORS = {
    'DividendYield'            : ValueFactors.DividendYield,
    'EBITDAYield'              : ValueFactors.EBITDAYield,
    'EVToEBITDA'               : ValueFactors.EVToEBITDA,
    'EVToFCF'                  : ValueFactors.EVToFCF,
    'PriceToBook'              : ValueFactors.PriceToBook,
    'PriceToDilutedEarningsTTM': ValueFactors.PriceToDilutedEarningsTTM,
    'PriceToEarningsTTM'       : ValueFactors.PriceToEarningsTTM,
    'PriceToFCF'               : ValueFactors.PriceToFCF,
    'PriceToForwardEarnings'   : ValueFactors.PriceToForwardEarnings,
    'PriceToOperatingCashflow' : ValueFactors.PriceToOperatingCashflow,
    'PriceToSalesTTM'          : ValueFactors.PriceToSalesTTM,
}
In [16]:
value_factors, t = factor_pipeline(VALUE_FACTORS)
print('Pipeline run time {:.2f} secs'.format(t))
value_factors.info()
Running pipeline: 2017-01-01 00:00:00  -  2017-07-02 00:00:00
Pipeline Execution Time: 49.51 Seconds
Time to run this chunk of the pipeline 52.05 secs
Running pipeline: 2017-07-04 00:00:00  -  2018-01-02 00:00:00
/venvs/py35/lib/python3.5/site-packages/numpy/lib/arraysetops.py:200: FutureWarning: In the future, NAT != NAT will be True rather than False.
  flag = np.concatenate(([True], aux[1:] != aux[:-1]))
Pipeline Execution Time: 37.55 Seconds
Time to run this chunk of the pipeline 38.93 secs
Running pipeline: 2018-01-03 00:00:00  -  2018-07-04 00:00:00
Pipeline Execution Time: 38.12 Seconds
Time to run this chunk of the pipeline 39.53 secs
Running pipeline: 2018-07-06 00:00:00  -  2018-12-31 00:00:00
Pipeline Execution Time: 38.06 Seconds
Time to run this chunk of the pipeline 39.46 secs
Time to run the entire pipeline 169.98 secs
Pipeline run time 169.99 secs
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 25100 entries, (2017-01-03 00:00:00+00:00, Equity(24 [AAPL])) to (2018-12-31 00:00:00+00:00, Equity(51157 [DD]))
Data columns (total 11 columns):
DividendYield                19739 non-null float64
EBITDAYield                  21929 non-null float64
EVToEBITDA                   21929 non-null float64
EVToFCF                      25005 non-null float64
PriceToBook                  25100 non-null float64
PriceToDilutedEarningsTTM    24985 non-null float64
PriceToEarningsTTM           24804 non-null float64
PriceToFCF                   25100 non-null float64
PriceToForwardEarnings       25080 non-null float64
PriceToOperatingCashflow     25100 non-null float64
PriceToSalesTTM              25100 non-null float64
dtypes: float64(11)
memory usage: 2.3+ MB

Momentum

In [17]:
class MomentumFactors:
    """Custom Momentum Factors"""
    class PercentAboveLow(CustomFactor):
        """Percentage of current close above low 
        in lookback window of window_length days
        """
        inputs = [USEquityPricing.close]
        window_length = 252

        def compute(self, today, assets, out, close):
            out[:] = close[-1] / np.min(close, axis=0) - 1

    class PercentBelowHigh(CustomFactor):
        """Percentage of current close below high 
        in lookback window of window_length days
        """
        
        inputs = [USEquityPricing.close]
        window_length = 252
            
        def compute(self, today, assets, out, close):
            out[:] = close[-1] / np.max(close, axis=0) - 1

    @staticmethod
    def make_dx(timeperiod=14):
        class DX(CustomFactor):
            """Directional Movement Index"""
            inputs = [USEquityPricing.high, 
                      USEquityPricing.low, 
                      USEquityPricing.close]
            window_length = timeperiod + 1
            
            def compute(self, today, assets, out, high, low, close):
                out[:] = [talib.DX(high[:, i], 
                                   low[:, i], 
                                   close[:, i], 
                                   timeperiod=timeperiod)[-1] 
                          for i in range(len(assets))]
        return DX  

    @staticmethod
    def make_mfi(timeperiod=14):
        class MFI(CustomFactor):
            """Money Flow Index"""
            inputs = [USEquityPricing.high, 
                      USEquityPricing.low, 
                      USEquityPricing.close,
                      USEquityPricing.volume]
            window_length = timeperiod + 1
            
            def compute(self, today, assets, out, high, low, close, vol):
                out[:] = [talib.MFI(high[:, i], 
                                    low[:, i], 
                                    close[:, i],
                                    vol[:, i],
                                    timeperiod=timeperiod)[-1] 
                          for i in range(len(assets))]
        return MFI           

    @staticmethod
    def make_oscillator(fastperiod=12, slowperiod=26, matype=0):
        class PPO(CustomFactor):
            """12/26-Day Percent Price Oscillator"""
            inputs = [USEquityPricing.close]
            window_length = slowperiod

            def compute(self, today, assets, out, close_prices):
                out[:] = [talib.PPO(close,
                                    fastperiod=fastperiod,
                                    slowperiod=slowperiod, 
                                    matype=matype)[-1]
                         for close in close_prices.T]
        return PPO

    @staticmethod
    def make_stochastic_oscillator(fastk_period=5, slowk_period=3, slowd_period=3, 
                                   slowk_matype=0, slowd_matype=0):                
        class StochasticOscillator(CustomFactor):
            """20-day Stochastic Oscillator """
            inputs = [USEquityPricing.high, 
                      USEquityPricing.low, 
                      USEquityPricing.close]
            outputs = ['slowk', 'slowd']
            window_length = fastk_period * 2
            
            def compute(self, today, assets, out, high, low, close):
                slowk, slowd = [talib.STOCH(high[:, i],
                                            low[:, i],
                                            close[:, i], 
                                            fastk_period=fastk_period,
                                            slowk_period=slowk_period, 
                                            slowk_matype=slowk_matype, 
                                            slowd_period=slowd_period, 
                                            slowd_matype=slowd_matype)[-1] 
                                for i in range(len(assets))]

                out.slowk[:], out.slowd[:] = slowk[-1], slowd[-1]
        return StochasticOscillator
    
    @staticmethod
    def make_trendline(timeperiod=252):                
        class Trendline(CustomFactor):
            inputs = [USEquityPricing.close]
            """52-Week Trendline"""
            window_length = timeperiod

            def compute(self, today, assets, out, close_prices):
                out[:] = [talib.LINEARREG_SLOPE(close, 
                                   timeperiod=timeperiod)[-1] 
                          for close in close_prices.T]
        return Trendline
In [18]:
MOMENTUM_FACTORS = {
    'Percent Above Low'            : MomentumFactors.PercentAboveLow,
    'Percent Below High'           : MomentumFactors.PercentBelowHigh,
    'Price Oscillator'             : MomentumFactors.make_oscillator(),
    'Money Flow Index'             : MomentumFactors.make_mfi(),
    'Directional Movement Index'   : MomentumFactors.make_dx(),
    'Trendline'                    : MomentumFactors.make_trendline()
}
In [19]:
momentum_factors, t = factor_pipeline(MOMENTUM_FACTORS)
print('Pipeline run time {:.2f} secs'.format(t))
momentum_factors.info()
Running pipeline: 2017-01-01 00:00:00  -  2017-07-02 00:00:00
Pipeline Execution Time: 5.02 Seconds
Time to run this chunk of the pipeline 6.43 secs
Running pipeline: 2017-07-04 00:00:00  -  2018-01-02 00:00:00
Pipeline Execution Time: 5.06 Seconds
Time to run this chunk of the pipeline 6.47 secs
Running pipeline: 2018-01-03 00:00:00  -  2018-07-04 00:00:00
Pipeline Execution Time: 4.99 Seconds
Time to run this chunk of the pipeline 6.44 secs
Running pipeline: 2018-07-06 00:00:00  -  2018-12-31 00:00:00
Pipeline Execution Time: 5.07 Seconds
Time to run this chunk of the pipeline 6.40 secs
Time to run the entire pipeline 25.75 secs
Pipeline run time 25.75 secs
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 25100 entries, (2017-01-03 00:00:00+00:00, Equity(24 [AAPL])) to (2018-12-31 00:00:00+00:00, Equity(51157 [DD]))
Data columns (total 6 columns):
Directional Movement Index    25100 non-null float64
Money Flow Index              25100 non-null float64
Percent Above Low             25018 non-null float64
Percent Below High            25018 non-null float64
Price Oscillator              25100 non-null float64
Trendline                     25018 non-null float64
dtypes: float64(6)
memory usage: 1.3+ MB

Efficiency Factors

In [20]:
class EfficiencyFactors:

    @staticmethod
    def CapexToAssets(mask):
        """Capital Expenditure divided by Total Assets"""
        capex = AnnualizedData(inputs = [Fundamentals.capital_expenditure_asof_date,
                                         Fundamentals.capital_expenditure],
                                     mask=mask)   
        assets = Fundamentals.total_assets.latest
        return - capex / assets

    @staticmethod
    def CapexToSales(mask):
        """Capital Expenditure divided by Total Revenue"""
        capex = AnnualizedData(inputs = [Fundamentals.capital_expenditure_asof_date,
                                         Fundamentals.capital_expenditure],
                                     mask=mask)   
        revenue = AnnualizedData(inputs = [Fundamentals.total_revenue_asof_date,
                                         Fundamentals.total_revenue],
                                     mask=mask)         
        return - capex / revenue
  
    @staticmethod
    def CapexToFCF(mask):
        """Capital Expenditure divided by Free Cash Flows"""
        capex = AnnualizedData(inputs = [Fundamentals.capital_expenditure_asof_date,
                                         Fundamentals.capital_expenditure],
                                     mask=mask)   
        free_cash_flow = AnnualizedData(inputs = [Fundamentals.free_cash_flow_asof_date,
                                         Fundamentals.free_cash_flow],
                                     mask=mask)         
        return - capex / free_cash_flow

    @staticmethod
    def EBITToAssets(mask):
        """Earnings Before Interest and Taxes (EBIT) divided by Total Assets"""
        ebit = AnnualizedData(inputs = [Fundamentals.ebit_asof_date,
                                         Fundamentals.ebit],
                                     mask=mask)   
        assets = Fundamentals.total_assets.latest
        return ebit / assets
    
    @staticmethod
    def CFOToAssets(mask):
        """Operating Cash Flows divided by Total Assets"""
        cfo = AnnualizedData(inputs = [Fundamentals.operating_cash_flow_asof_date,
                                         Fundamentals.operating_cash_flow],
                                     mask=mask)   
        assets = Fundamentals.total_assets.latest
        return cfo / assets 
    
    @staticmethod
    def RetainedEarningsToAssets(mask):
        """Retained Earnings divided by Total Assets"""
        retained_earnings = AnnualizedData(inputs = [Fundamentals.retained_earnings_asof_date,
                                         Fundamentals.retained_earnings],
                                     mask=mask)   
        assets = Fundamentals.total_assets.latest
        return retained_earnings / assets
In [21]:
EFFICIENCY_FACTORS = {
    'CFO To Assets' :EfficiencyFactors.CFOToAssets,
    'Capex To Assets' :EfficiencyFactors.CapexToAssets,
    'Capex To FCF' :EfficiencyFactors.CapexToFCF,
    'Capex To Sales' :EfficiencyFactors.CapexToSales,
    'EBIT To Assets' :EfficiencyFactors.EBITToAssets,
    'Retained Earnings To Assets' :EfficiencyFactors.RetainedEarningsToAssets
    }
In [22]:
efficiency_factors, t = factor_pipeline(EFFICIENCY_FACTORS)
print('Pipeline run time {:.2f} secs'.format(t))
efficiency_factors.info()
Running pipeline: 2017-01-01 00:00:00  -  2017-07-02 00:00:00
Pipeline Execution Time: 9.52 Seconds
Time to run this chunk of the pipeline 10.96 secs
Running pipeline: 2017-07-04 00:00:00  -  2018-01-02 00:00:00
Pipeline Execution Time: 10.30 Seconds
Time to run this chunk of the pipeline 11.66 secs
Running pipeline: 2018-01-03 00:00:00  -  2018-07-04 00:00:00
Pipeline Execution Time: 10.36 Seconds
Time to run this chunk of the pipeline 11.82 secs
Running pipeline: 2018-07-06 00:00:00  -  2018-12-31 00:00:00
Pipeline Execution Time: 10.47 Seconds
Time to run this chunk of the pipeline 11.92 secs
Time to run the entire pipeline 46.36 secs
Pipeline run time 46.37 secs
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 25100 entries, (2017-01-03 00:00:00+00:00, Equity(24 [AAPL])) to (2018-12-31 00:00:00+00:00, Equity(51157 [DD]))
Data columns (total 6 columns):
CFO To Assets                  25005 non-null float64
Capex To Assets                23566 non-null float64
Capex To FCF                   23566 non-null float64
Capex To Sales                 23566 non-null float64
EBIT To Assets                 22369 non-null float64
Retained Earnings To Assets    25005 non-null float64
dtypes: float64(6)
memory usage: 1.3+ MB

Risk Factors

In [23]:
class RiskFactors:

    @staticmethod
    def LogMarketCap(mask):
        """Log of Market Capitalization log(Close Price * Shares Outstanding)"""
        return np.log(MarketCap(mask=mask))
 
    class DownsideRisk(CustomFactor):
        """Mean returns divided by std of 1yr daily losses (Sortino Ratio)"""
        inputs = [USEquityPricing.close]
        window_length = 252

        def compute(self, today, assets, out, close):
            ret = pd.DataFrame(close).pct_change()
            out[:] = ret.mean().div(ret.where(ret<0).std())

    @staticmethod
    def MarketBeta(**kwargs):
        """Slope of 1-yr regression of price returns against index returns"""
        return SimpleBeta(target=symbols('SPY'), regression_length=252) 

    class DownsideBeta(CustomFactor):
        """Slope of 1yr regression of returns on negative index returns"""
        inputs = [USEquityPricing.close]
        window_length = 252

        def compute(self, today, assets, out, close):
            t = len(close)
            assets = pd.DataFrame(close).pct_change()
            
            start_date = (today - pd.DateOffset(years=1)).strftime('%Y-%m-%d')
            spy = get_pricing('SPY', 
                              start_date=start_date, 
                              end_date=today.strftime('%Y-%m-%d')).reset_index(drop=True)
            spy_neg_ret = (spy
                           .close_price
                           .iloc[-t:]
                           .pct_change()
                           .pipe(lambda x: x.where(x<0)))
    
            out[:] = assets.apply(lambda x: x.cov(spy_neg_ret)).div(spy_neg_ret.var())         

    class Vol3M(CustomFactor):
        """3-month Volatility: Standard deviation of returns over 3 months"""

        inputs = [USEquityPricing.close]
        window_length = 63

        def compute(self, today, assets, out, close):
            out[:] = np.log1p(pd.DataFrame(close).pct_change()).std()
In [24]:
RISK_FACTORS = {
    'Log Market Cap' : RiskFactors.LogMarketCap,
    'Downside Risk'  : RiskFactors.DownsideRisk,
    'Index Beta'     : RiskFactors.MarketBeta,
     #'Downside Beta'  : RiskFactors.DownsideBeta,    
    'Volatility 3M'  : RiskFactors.Vol3M,    
}
In [25]:
risk_factors, t = factor_pipeline(RISK_FACTORS)
print('Pipeline run time {:.2f} secs'.format(t))
risk_factors.info()
Running pipeline: 2017-01-01 00:00:00  -  2017-07-02 00:00:00
Pipeline Execution Time: 10.42 Seconds
Time to run this chunk of the pipeline 12.55 secs
Running pipeline: 2017-07-04 00:00:00  -  2018-01-02 00:00:00
Pipeline Execution Time: 10.37 Seconds
Time to run this chunk of the pipeline 11.74 secs
Running pipeline: 2018-01-03 00:00:00  -  2018-07-04 00:00:00
Pipeline Execution Time: 10.65 Seconds
Time to run this chunk of the pipeline 12.03 secs
Running pipeline: 2018-07-06 00:00:00  -  2018-12-31 00:00:00
Pipeline Execution Time: 10.49 Seconds
Time to run this chunk of the pipeline 11.87 secs
Time to run the entire pipeline 48.19 secs
Pipeline run time 55.26 secs
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 25100 entries, (2017-01-03 00:00:00+00:00, Equity(24 [AAPL])) to (2018-12-31 00:00:00+00:00, Equity(51157 [DD]))
Data columns (total 4 columns):
Downside Risk     25100 non-null float64
Index Beta        25100 non-null float64
Log Market Cap    25100 non-null float64
Volatility 3M     25100 non-null float64
dtypes: float64(4)
memory usage: 980.5+ KB

Growth Factors

In [26]:
def growth_pipeline():
    revenue = AnnualizedData(inputs = [Fundamentals.total_revenue_asof_date,
                                       Fundamentals.total_revenue],
                             mask=UNIVERSE)
    eps = AnnualizedData(inputs = [Fundamentals.diluted_eps_earnings_reports_asof_date,
                                       Fundamentals.diluted_eps_earnings_reports],
                             mask=UNIVERSE)    

    return Pipeline({'Sales': revenue,
                     'EPS': eps,
                     'Total Assets': Fundamentals.total_assets.latest,
                     'Net Debt': Fundamentals.net_debt.latest},
                    screen=UNIVERSE)
In [27]:
start_timer = time()
growth_factors = run_pipeline(growth_pipeline(), start_date=START, end_date=END)

for col in growth_factors.columns:
    for month in [3, 12]:
        new_col = col + ' Growth {}M'.format(month)
        kwargs = {new_col: growth_factors[col].pct_change(month*MONTH).groupby(level=1).rank()}        
        growth_factors = growth_factors.assign(**kwargs)
print('Pipeline run time {:.2f} secs'.format(time() - start_timer))
growth_factors.info()
Pipeline Execution Time: 20.40 Seconds
Pipeline run time 24.33 secs
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 25100 entries, (2017-01-03 00:00:00+00:00, Equity(24 [AAPL])) to (2018-12-31 00:00:00+00:00, Equity(51157 [DD]))
Data columns (total 12 columns):
EPS                        24985 non-null float64
Net Debt                   23832 non-null float64
Sales                      25005 non-null float64
Total Assets               25100 non-null float64
EPS Growth 3M              24922 non-null float64
EPS Growth 12M             24733 non-null float64
Net Debt Growth 3M         23772 non-null float64
Net Debt Growth 12M        23595 non-null float64
Sales Growth 3M            24942 non-null float64
Sales Growth 12M           24753 non-null float64
Total Assets Growth 3M     25037 non-null float64
Total Assets Growth 12M    24848 non-null float64
dtypes: float64(12)
memory usage: 2.5+ MB

Quality Factors

In [28]:
class QualityFactors:
    
    @staticmethod
    def AssetTurnover(mask):
        """Sales divided by average of year beginning and year end assets"""

        assets = AnnualAvg(inputs=[Fundamentals.total_assets],
                           mask=mask)
        sales = AnnualizedData([Fundamentals.total_revenue_asof_date,
                                Fundamentals.total_revenue], mask=mask)
        return sales / assets
  
    @staticmethod
    def CurrentRatio(mask):
        """Total current assets divided by total current liabilities"""

        assets = Fundamentals.current_assets.latest
        liabilities = Fundamentals.current_liabilities.latest
        return assets / liabilities
    
    @staticmethod
    def AssetToEquityRatio(mask):
        """Total current assets divided by common equity"""

        assets = Fundamentals.current_assets.latest
        equity = Fundamentals.common_stock.latest
        return assets / equity    

    
    @staticmethod
    def InterestCoverage(mask):
        """EBIT divided by interest expense"""

        ebit = AnnualizedData(inputs = [Fundamentals.ebit_asof_date,
                                        Fundamentals.ebit], mask=mask)  
        
        interest_expense = AnnualizedData(inputs = [Fundamentals.interest_expense_asof_date,
                                        Fundamentals.interest_expense], mask=mask)
        return ebit / interest_expense

    @staticmethod
    def DebtToAssetRatio(mask):
        """Total Debts divided by Total Assets"""

        debt = Fundamentals.total_debt.latest
        assets = Fundamentals.total_assets.latest
        return debt / assets
    
    @staticmethod
    def DebtToEquityRatio(mask):
        """Total Debts divided by Common Stock Equity"""

        debt = Fundamentals.total_debt.latest
        equity = Fundamentals.common_stock.latest
        return debt / equity    

    @staticmethod
    def WorkingCapitalToAssets(mask):
        """Current Assets less Current liabilities (Working Capital) divided by Assets"""

        working_capital = Fundamentals.working_capital.latest
        assets = Fundamentals.total_assets.latest
        return working_capital / assets
 
    @staticmethod
    def WorkingCapitalToSales(mask):
        """Current Assets less Current liabilities (Working Capital), divided by Sales"""

        working_capital = Fundamentals.working_capital.latest
        sales = AnnualizedData([Fundamentals.total_revenue_asof_date,
                                Fundamentals.total_revenue], mask=mask)        
        return working_capital / sales          
       
        
    class MertonsDD(CustomFactor):
        """Merton's Distance to Default """
        
        inputs = [Fundamentals.total_assets,
                  Fundamentals.total_liabilities, 
                  libor.value, 
                  USEquityPricing.close]
        window_length = 252

        def compute(self, today, assets, out, tot_assets, tot_liabilities, r, close):
            mertons = []

            for col_assets, col_liabilities, col_r, col_close in zip(tot_assets.T, tot_liabilities.T,
                                                                     r.T, close.T):
                vol_1y = np.nanstd(col_close)
                numerator = np.log(
                        col_assets[-1] / col_liabilities[-1]) + ((252 * col_r[-1]) - ((vol_1y ** 2) / 2))
                mertons.append(numerator / vol_1y)

            out[:] = mertons            
In [29]:
QUALITY_FACTORS = {
    'AssetToEquityRatio'    : QualityFactors.AssetToEquityRatio,
    'AssetTurnover'         : QualityFactors.AssetTurnover,
    'CurrentRatio'          : QualityFactors.CurrentRatio,
    'DebtToAssetRatio'      : QualityFactors.DebtToAssetRatio,
    'DebtToEquityRatio'     : QualityFactors.DebtToEquityRatio,
    'InterestCoverage'      : QualityFactors.InterestCoverage,
    'MertonsDD'             : QualityFactors.MertonsDD,
    'WorkingCapitalToAssets': QualityFactors.WorkingCapitalToAssets,
    'WorkingCapitalToSales' : QualityFactors.WorkingCapitalToSales,
}
    
In [30]:
quality_factors, t = factor_pipeline(QUALITY_FACTORS)
print('Pipeline run time {:.2f} secs'.format(t))
quality_factors.info()
Running pipeline: 2017-01-01 00:00:00  -  2017-07-02 00:00:00
Pipeline Execution Time: 34.88 Seconds
Time to run this chunk of the pipeline 36.23 secs
Running pipeline: 2017-07-04 00:00:00  -  2018-01-02 00:00:00
Pipeline Execution Time: 32.94 Seconds
Time to run this chunk of the pipeline 34.32 secs
Running pipeline: 2018-01-03 00:00:00  -  2018-07-04 00:00:00
Pipeline Execution Time: 32.99 Seconds
Time to run this chunk of the pipeline 34.36 secs
Running pipeline: 2018-07-06 00:00:00  -  2018-12-31 00:00:00
Pipeline Execution Time: 32.61 Seconds
Time to run this chunk of the pipeline 33.97 secs
Time to run the entire pipeline 138.88 secs
Pipeline run time 138.89 secs
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 25100 entries, (2017-01-03 00:00:00+00:00, Equity(24 [AAPL])) to (2018-12-31 00:00:00+00:00, Equity(51157 [DD]))
Data columns (total 9 columns):
AssetToEquityRatio        22464 non-null float64
AssetTurnover             24985 non-null float64
CurrentRatio              22464 non-null float64
DebtToAssetRatio          25080 non-null float64
DebtToEquityRatio         24551 non-null float64
InterestCoverage          20461 non-null float64
MertonsDD                 25100 non-null float64
WorkingCapitalToAssets    22464 non-null float64
WorkingCapitalToSales     22369 non-null float64
dtypes: float64(9)
memory usage: 1.9+ MB

Payout Factors

In [31]:
class PayoutFactors:

    @staticmethod
    def DividendPayoutRatio(mask):
        """Dividends Per Share divided by Earnings Per Share"""

        dps = AnnualizedData(inputs = [Fundamentals.dividend_per_share_earnings_reports_asof_date,
                                        Fundamentals.dividend_per_share_earnings_reports], mask=mask)  
        
        eps = AnnualizedData(inputs = [Fundamentals.basic_eps_earnings_reports_asof_date,
                                        Fundamentals.basic_eps_earnings_reports], mask=mask)
        return dps / eps
    
    @staticmethod
    def DividendGrowth(**kwargs):
        """Annualized percentage DPS change"""        
        return Fundamentals.dps_growth.latest    
In [32]:
PAYOUT_FACTORS = {
    'Dividend Payout Ratio': PayoutFactors.DividendPayoutRatio,
    'Dividend Growth': PayoutFactors.DividendGrowth
}
In [33]:
payout_factors, t = factor_pipeline(PAYOUT_FACTORS)
print('Pipeline run time {:.2f} secs'.format(t))
payout_factors.info()
Running pipeline: 2017-01-01 00:00:00  -  2017-07-02 00:00:00
Pipeline Execution Time: 6.58 Seconds
Time to run this chunk of the pipeline 7.93 secs
Running pipeline: 2017-07-04 00:00:00  -  2018-01-02 00:00:00
Pipeline Execution Time: 5.69 Seconds
Time to run this chunk of the pipeline 7.02 secs
Running pipeline: 2018-01-03 00:00:00  -  2018-07-04 00:00:00
Pipeline Execution Time: 5.63 Seconds
Time to run this chunk of the pipeline 7.05 secs
Running pipeline: 2018-07-06 00:00:00  -  2018-12-31 00:00:00
Pipeline Execution Time: 5.88 Seconds
Time to run this chunk of the pipeline 7.27 secs
Time to run the entire pipeline 29.27 secs
Pipeline run time 29.28 secs
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 25100 entries, (2017-01-03 00:00:00+00:00, Equity(24 [AAPL])) to (2018-12-31 00:00:00+00:00, Equity(51157 [DD]))
Data columns (total 2 columns):
Dividend Growth          19558 non-null float64
Dividend Payout Ratio    19418 non-null float64
dtypes: float64(2)
memory usage: 588.3+ KB

Profitability Factors

In [34]:
class ProfitabilityFactors:
    
    @staticmethod
    def GrossProfitMargin(mask):
        """Gross Profit divided by Net Sales"""

        gross_profit = AnnualizedData([Fundamentals.gross_profit_asof_date,
                              Fundamentals.gross_profit], mask=mask)  
        sales = AnnualizedData([Fundamentals.total_revenue_asof_date,
                                Fundamentals.total_revenue], mask=mask)
        return gross_profit / sales   
    
    @staticmethod
    def NetIncomeMargin(mask):
        """Net income divided by Net Sales"""

        net_income = AnnualizedData([Fundamentals.net_income_income_statement_asof_date,
                              Fundamentals.net_income_income_statement], mask=mask)  
        sales = AnnualizedData([Fundamentals.total_revenue_asof_date,
                                Fundamentals.total_revenue], mask=mask)
        return net_income / sales   
In [35]:
PROFITABIILTY_FACTORS = {
    'Gross Profit Margin': ProfitabilityFactors.GrossProfitMargin,
    'Net Income Margin': ProfitabilityFactors.NetIncomeMargin,
    'Return on Equity': Fundamentals.roe.latest,
    'Return on Assets': Fundamentals.roa.latest,
    'Return on Invested Capital': Fundamentals.roic.latest
}
In [36]:
profitability_factors, t = factor_pipeline(PAYOUT_FACTORS)
print('Pipeline run time {:.2f} secs'.format(t))
payout_factors.info()
Running pipeline: 2017-01-01 00:00:00  -  2017-07-02 00:00:00
Pipeline Execution Time: 5.69 Seconds
Time to run this chunk of the pipeline 7.78 secs
Running pipeline: 2017-07-04 00:00:00  -  2018-01-02 00:00:00
Pipeline Execution Time: 5.78 Seconds
Time to run this chunk of the pipeline 7.20 secs
Running pipeline: 2018-01-03 00:00:00  -  2018-07-04 00:00:00
Pipeline Execution Time: 5.68 Seconds
Time to run this chunk of the pipeline 7.09 secs
Running pipeline: 2018-07-06 00:00:00  -  2018-12-31 00:00:00
Pipeline Execution Time: 6.63 Seconds
Time to run this chunk of the pipeline 8.04 secs
Time to run the entire pipeline 30.12 secs
Pipeline run time 30.13 secs
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 25100 entries, (2017-01-03 00:00:00+00:00, Equity(24 [AAPL])) to (2018-12-31 00:00:00+00:00, Equity(51157 [DD]))
Data columns (total 2 columns):
Dividend Growth          19558 non-null float64
Dividend Payout Ratio    19418 non-null float64
dtypes: float64(2)
memory usage: 588.3+ KB

Build Dataset

Get Returns

We will test predictions for various lookahead periods to identify the best holding periods that generate the best predictability, measured by the information coefficient.

More specifically, we compute returns for 1, 5, 10, 20 and 60 days using the built-in Returns function, resulting in over 25,000 observations for the universe of 100 stocks over two years (that include approximately 252 trading days each)

In [37]:
lookahead = [1, 5, 10, 20, 60]
returns = run_pipeline(Pipeline({'Returns{}D'.format(i): Returns(inputs=[USEquityPricing.close], 
                                          window_length=i+1, mask=UNIVERSE) for i in lookahead},
                                screen=UNIVERSE),
                       start_date=START, 
                       end_date=END)
return_cols = ['Returns{}D'.format(i) for i in lookahead]
returns.info()
Pipeline Execution Time: 12.53 Seconds
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 25100 entries, (2017-01-03 00:00:00+00:00, Equity(24 [AAPL])) to (2018-12-31 00:00:00+00:00, Equity(51157 [DD]))
Data columns (total 5 columns):
Returns10D    25100 non-null float64
Returns1D     25100 non-null float64
Returns20D    25100 non-null float64
Returns5D     25100 non-null float64
Returns60D    25100 non-null float64
dtypes: float64(5)
memory usage: 1.1+ MB

We will use over 50 features that cover a broad range of factors based on market, fundamental, and alternative data. The notebook also includes custom transformations to convert fundamental data that is typically available in quarterly reporting frequency to rolling annual totals or averages to avoid excessive season fluctuations.

Once the factors have been computed we combine them using pd.concat(), assign index names, and create a categorical variable that identifies the asset for each data point:

In [38]:
data = pd.concat([returns,
                 value_factors,
                 momentum_factors,
                 quality_factors,
                 payout_factors,
                 growth_factors,
                 efficiency_factors,
                 risk_factors], axis=1).sortlevel()
data.index.names = ['date', 'asset']
In [39]:
data['stock'] = data.index.get_level_values('asset').map(lambda x: x.asset_name)
data.info()
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 25100 entries, (2017-01-03 00:00:00+00:00, Equity(24 [AAPL])) to (2018-12-31 00:00:00+00:00, Equity(51157 [DD]))
Data columns (total 56 columns):
Returns10D                     25100 non-null float64
Returns1D                      25100 non-null float64
Returns20D                     25100 non-null float64
Returns5D                      25100 non-null float64
Returns60D                     25100 non-null float64
DividendYield                  19739 non-null float64
EBITDAYield                    21929 non-null float64
EVToEBITDA                     21929 non-null float64
EVToFCF                        25005 non-null float64
PriceToBook                    25100 non-null float64
PriceToDilutedEarningsTTM      24985 non-null float64
PriceToEarningsTTM             24804 non-null float64
PriceToFCF                     25100 non-null float64
PriceToForwardEarnings         25080 non-null float64
PriceToOperatingCashflow       25100 non-null float64
PriceToSalesTTM                25100 non-null float64
Directional Movement Index     25100 non-null float64
Money Flow Index               25100 non-null float64
Percent Above Low              25018 non-null float64
Percent Below High             25018 non-null float64
Price Oscillator               25100 non-null float64
Trendline                      25018 non-null float64
AssetToEquityRatio             22464 non-null float64
AssetTurnover                  24985 non-null float64
CurrentRatio                   22464 non-null float64
DebtToAssetRatio               25080 non-null float64
DebtToEquityRatio              24551 non-null float64
InterestCoverage               20461 non-null float64
MertonsDD                      25100 non-null float64
WorkingCapitalToAssets         22464 non-null float64
WorkingCapitalToSales          22369 non-null float64
Dividend Growth                19558 non-null float64
Dividend Payout Ratio          19418 non-null float64
EPS                            24985 non-null float64
Net Debt                       23832 non-null float64
Sales                          25005 non-null float64
Total Assets                   25100 non-null float64
EPS Growth 3M                  24922 non-null float64
EPS Growth 12M                 24733 non-null float64
Net Debt Growth 3M             23772 non-null float64
Net Debt Growth 12M            23595 non-null float64
Sales Growth 3M                24942 non-null float64
Sales Growth 12M               24753 non-null float64
Total Assets Growth 3M         25037 non-null float64
Total Assets Growth 12M        24848 non-null float64
CFO To Assets                  25005 non-null float64
Capex To Assets                23566 non-null float64
Capex To FCF                   23566 non-null float64
Capex To Sales                 23566 non-null float64
EBIT To Assets                 22369 non-null float64
Retained Earnings To Assets    25005 non-null float64
Downside Risk                  25100 non-null float64
Index Beta                     25100 non-null float64
Log Market Cap                 25100 non-null float64
Volatility 3M                  25100 non-null float64
stock                          25100 non-null object
dtypes: float64(55), object(1)
memory usage: 10.9+ MB

Visualizing missing values

In [40]:
# Craete sorted dataframe of numeric_features with missing_count
missing_values0 = data.isnull().sum(axis=0).reset_index()
missing_values0.columns = ['column_name', 'missing_count']
missing_values0 = missing_values0.loc[missing_values0['missing_count']>0]
missing_values0 = missing_values0.sort_values(by='missing_count')
In [41]:
# Get percantage of total NaNs numeric_features
total0 = data.isnull().sum().sort_values(ascending=False)
percent0 = (data.isnull().sum()/data.isnull().count()).sort_values(ascending=False)
missing_data0 = pd.concat([total0, percent0], axis=1,join='outer', keys=['Total Missing Count', '% of Total Observations'])
missing_data0.index.name =' Numeric Feature'

missing_data0.head(len(data.columns))
Out[41]:
Total Missing Count % of Total Observations
Numeric Feature
Dividend Payout Ratio 5682 0.226375
Dividend Growth 5542 0.220797
DividendYield 5361 0.213586
InterestCoverage 4639 0.184821
EVToEBITDA 3171 0.126335
EBITDAYield 3171 0.126335
EBIT To Assets 2731 0.108805
WorkingCapitalToSales 2731 0.108805
AssetToEquityRatio 2636 0.105020
CurrentRatio 2636 0.105020
WorkingCapitalToAssets 2636 0.105020
Capex To Sales 1534 0.061116
Capex To FCF 1534 0.061116
Capex To Assets 1534 0.061116
Net Debt Growth 12M 1505 0.059960
Net Debt Growth 3M 1328 0.052908
Net Debt 1268 0.050518
DebtToEquityRatio 549 0.021873
EPS Growth 12M 367 0.014622
Sales Growth 12M 347 0.013825
PriceToEarningsTTM 296 0.011793
Total Assets Growth 12M 252 0.010040
EPS Growth 3M 178 0.007092
Sales Growth 3M 158 0.006295
EPS 115 0.004582
PriceToDilutedEarningsTTM 115 0.004582
AssetTurnover 115 0.004582
CFO To Assets 95 0.003785
Retained Earnings To Assets 95 0.003785
Sales 95 0.003785
EVToFCF 95 0.003785
Percent Below High 82 0.003267
Percent Above Low 82 0.003267
Trendline 82 0.003267
Total Assets Growth 3M 63 0.002510
DebtToAssetRatio 20 0.000797
PriceToForwardEarnings 20 0.000797
Returns1D 0 0.000000
Returns20D 0 0.000000
Returns5D 0 0.000000
PriceToBook 0 0.000000
PriceToFCF 0 0.000000
Returns60D 0 0.000000
stock 0 0.000000
PriceToOperatingCashflow 0 0.000000
PriceToSalesTTM 0 0.000000
Directional Movement Index 0 0.000000
Money Flow Index 0 0.000000
Price Oscillator 0 0.000000
Volatility 3M 0 0.000000
MertonsDD 0 0.000000
Total Assets 0 0.000000
Downside Risk 0 0.000000
Index Beta 0 0.000000
Log Market Cap 0 0.000000
Returns10D 0 0.000000
In [42]:
ind0 = np.arange(missing_values0.shape[0])
width0 = 0.1
fig, ax = plt.subplots(figsize=(13,5))
colors0 = sns.color_palette('Set2', len(ind0))
rects0 = ax.bar(ind0, missing_values0.missing_count.values, color=colors0)
ax.set_xticks(ind0)
ax.set_xticklabels(missing_values0.column_name.values, rotation='vertical')
ax.set_ylabel("Count")
ax.set_title("Missing Observations Count")
ax.margins(0.001)
plt.show()

Remove columns and rows with less than 80% of data availability

In a next step, we remove rows and columns that lack more than 20 percent of the observations, resulting in a loss of six percent of the observations and 5 columns:

In [43]:
rows_before, cols_before = data.shape
data = (data
        .dropna(axis=1, thresh=int(len(data)*.8))
        .dropna(thresh=int(len(data.columns) * .8)))
#data = data.fillna(data.median())
data = data.bfill().ffill()
rows_after, cols_after = data.shape
print('{:,d} rows and {:,d} columns dropped'.format(rows_before-rows_after, cols_before-cols_after))
1,571 rows and 3 columns dropped

At this point, we have 51 features and the categorical identifier of the stock:

In [44]:
data.sort_index(1).info()
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 23529 entries, (2017-01-03 00:00:00+00:00, Equity(24 [AAPL])) to (2018-12-31 00:00:00+00:00, Equity(51157 [DD]))
Data columns (total 53 columns):
AssetToEquityRatio             23529 non-null float64
AssetTurnover                  23529 non-null float64
CFO To Assets                  23529 non-null float64
Capex To Assets                23529 non-null float64
Capex To FCF                   23529 non-null float64
Capex To Sales                 23529 non-null float64
CurrentRatio                   23529 non-null float64
DebtToAssetRatio               23529 non-null float64
DebtToEquityRatio              23529 non-null float64
Directional Movement Index     23529 non-null float64
Downside Risk                  23529 non-null float64
EBIT To Assets                 23529 non-null float64
EBITDAYield                    23529 non-null float64
EPS                            23529 non-null float64
EPS Growth 12M                 23529 non-null float64
EPS Growth 3M                  23529 non-null float64
EVToEBITDA                     23529 non-null float64
EVToFCF                        23529 non-null float64
Index Beta                     23529 non-null float64
InterestCoverage               23529 non-null float64
Log Market Cap                 23529 non-null float64
MertonsDD                      23529 non-null float64
Money Flow Index               23529 non-null float64
Net Debt                       23529 non-null float64
Net Debt Growth 12M            23529 non-null float64
Net Debt Growth 3M             23529 non-null float64
Percent Above Low              23529 non-null float64
Percent Below High             23529 non-null float64
Price Oscillator               23529 non-null float64
PriceToBook                    23529 non-null float64
PriceToDilutedEarningsTTM      23529 non-null float64
PriceToEarningsTTM             23529 non-null float64
PriceToFCF                     23529 non-null float64
PriceToForwardEarnings         23529 non-null float64
PriceToOperatingCashflow       23529 non-null float64
PriceToSalesTTM                23529 non-null float64
Retained Earnings To Assets    23529 non-null float64
Returns10D                     23529 non-null float64
Returns1D                      23529 non-null float64
Returns20D                     23529 non-null float64
Returns5D                      23529 non-null float64
Returns60D                     23529 non-null float64
Sales                          23529 non-null float64
Sales Growth 12M               23529 non-null float64
Sales Growth 3M                23529 non-null float64
Total Assets                   23529 non-null float64
Total Assets Growth 12M        23529 non-null float64
Total Assets Growth 3M         23529 non-null float64
Trendline                      23529 non-null float64
Volatility 3M                  23529 non-null float64
WorkingCapitalToAssets         23529 non-null float64
WorkingCapitalToSales          23529 non-null float64
stock                          23529 non-null object
dtypes: float64(52), object(1)
memory usage: 9.7+ MB

Data Exploration

First lets take a look at the individual distributions of all our data.

In [45]:
data.hist(bins=25, figsize=(22,22))
plt.show()

It is always a good idea to check the relationship between your features and target variable. Here we will look at a scatter plot of the 60 day target variable along with the p-value, r2 score and mean IC (information coefficient) for each feature.

In [50]:
tmp = data.drop(['Returns1D','Returns5D','Returns10D','Returns20D'], axis=1)
tmp.reset_index(level=['asset'], inplace=True, drop=True)
tmp.head()
Out[50]:
Returns60D EBITDAYield EVToEBITDA EVToFCF PriceToBook PriceToDilutedEarningsTTM PriceToEarningsTTM PriceToFCF PriceToForwardEarnings PriceToOperatingCashflow ... Capex To Assets Capex To FCF Capex To Sales EBIT To Assets Retained Earnings To Assets Downside Risk Index Beta Log Market Cap Volatility 3M stock
date
2017-01-03 00:00:00+00:00 0.029940 5.0 11.0 18.0 31.0 14.0 1063.0 16.0 1161.0 18.0 ... 29.0 22.0 28.0 40.0 26.0 24.0 3645.0 50.0 11.0 APPLE INC
2017-01-03 00:00:00+00:00 0.165562 34.0 24.0 15.0 50.0 35.0 2345.0 18.0 1833.0 16.0 ... 24.0 27.0 13.0 13.0 34.0 23.0 4675.0 17.0 16.0 BOEING CO
2017-01-03 00:00:00+00:00 0.066606 32.0 36.0 49.0 35.0 40.0 2788.0 49.0 2400.0 48.0 ... 25.0 44.0 26.0 31.0 47.0 6.0 2556.0 19.0 43.0 BRISTOL-MYERS SQUIBB CO
2017-01-03 00:00:00+00:00 0.099725 36.0 39.0 37.0 46.0 46.0 3390.0 40.0 1883.0 41.0 ... 12.0 12.0 14.0 27.0 28.0 10.0 5036.0 14.0 44.0 CELGENE CORP
2017-01-03 00:00:00+00:00 0.055036 13.0 10.0 39.0 21.0 28.0 1892.0 34.0 2227.0 14.0 ... 38.0 46.0 42.0 17.0 18.0 46.0 2461.0 29.0 14.0 COMCAST CORP

5 rows × 49 columns

In [47]:
def r2(x, y):
    return pearsonr(x, y)[0] ** 2
In [54]:
count = 0
for i, feature in enumerate(list(tmp), 1):
    count += 1
    
    if(feature == 'Returns60D'):
        print()
        
    else:
        print('{} # {}'.format(feature, count))
        plt.figure(figsize=(8,5))
        
        cm = plt.get_cmap('jet')
        colors = np.linspace(0.1, 1, len(tmp))
                             
        sc = plt.scatter(tmp[feature], tmp['Returns60D'], s=25, c=colors, cmap=cm, 
                 edgecolor='k', alpha=0.3, label='Price Data')
        
        j = sns.regplot(tmp[feature], tmp['Returns60D'], data=tmp, scatter=False, 
                line_kws={'color':'k','lw':2, 'linestyle':'dashed'})
    
        cb = plt.colorbar(sc)
        cb.ax.set_yticklabels([str(p) for p in tmp[::len(tmp)//9].index],
                         fontdict = {'fontsize': 10,
                                     'fontweight': 'medium'})
    
        plt.xlabel('{}'.format(feature), size=10, labelpad=10, fontsize=10, fontweight='medium')
        plt.ylabel('Returns60D', size=10, labelpad=10, fontsize=10, fontweight='medium')
        plt.grid(False)
        ic, pval = spearmanr(tmp[feature], tmp['Returns60D'])
        R2 = r2(tmp[feature], tmp['Returns60D'])
        plt.title('r2 = {}, IC = {}, P-Value = {}'.format(round(R2,4), round(ic,4), pval))
            
        for j in range(2):
            plt.tick_params(axis='x', labelsize=10)
            plt.tick_params(axis='y', labelsize=10)
            
        plt.show()
        
        if(count == len(tmp.columns)-1):
            break
EBITDAYield # 2
EVToEBITDA # 3