Scraping Stock Data#

This notebook download the stock data from Yahoo.

import pandas as pd
import io
import requests
from datetime import datetime
import matplotlib.pyplot as plt
import numpy as np
import yfinance as yf
import os
# get stock information to determine which sector it belongs to
aapl = yf.Ticker("AAPL")
aapl.info
{'zip': '95014',
 'sector': 'Technology',
 'fullTimeEmployees': 164000,
 'longBusinessSummary': 'Apple Inc. designs, manufactures, and markets smartphones, personal computers, tablets, wearables, and accessories worldwide. It also sells various related services. In addition, the company offers iPhone, a line of smartphones; Mac, a line of personal computers; iPad, a line of multi-purpose tablets; and wearables, home, and accessories comprising AirPods, Apple TV, Apple Watch, Beats products, and HomePod. Further, it provides AppleCare support and cloud services store services; and operates various platforms, including the App Store that allow customers to discover and download applications and digital content, such as books, music, video, games, and podcasts. Additionally, the company offers various services, such as Apple Arcade, a game subscription service; Apple Fitness+, a personalized fitness service; Apple Music, which offers users a curated listening experience with on-demand radio stations; Apple News+, a subscription news and magazine service; Apple TV+, which offers exclusive original content; Apple Card, a co-branded credit card; and Apple Pay, a cashless payment service, as well as licenses its intellectual property. The company serves consumers, and small and mid-sized businesses; and the education, enterprise, and government markets. It distributes third-party applications for its products through the App Store. The company also sells its products through its retail and online stores, and direct sales force; and third-party cellular network carriers, wholesalers, retailers, and resellers. Apple Inc. was incorporated in 1977 and is headquartered in Cupertino, California.',
 'city': 'Cupertino',
 'phone': '408 996 1010',
 'state': 'CA',
 'country': 'United States',
 'companyOfficers': [],
 'website': 'https://www.apple.com',
 'maxAge': 1,
 'address1': 'One Apple Park Way',
 'industry': 'Consumer Electronics',
 'ebitdaMargins': 0.33105,
 'profitMargins': 0.2531,
 'grossMargins': 0.43310001,
 'operatingCashflow': 122151002112,
 'revenueGrowth': 0.081,
 'operatingMargins': 0.30289,
 'ebitda': 130541002752,
 'targetLowPrice': 122,
 'recommendationKey': 'buy',
 'grossProfits': 170782000000,
 'freeCashflow': 90215251968,
 'targetMedianPrice': 180,
 'currentPrice': 147.81,
 'earningsGrowth': 0.048,
 'currentRatio': 0.879,
 'returnOnAssets': 0.21214001,
 'numberOfAnalystOpinions': 41,
 'targetMeanPrice': 178.15,
 'debtToEquity': 261.446,
 'returnOnEquity': 1.75459,
 'targetHighPrice': 214,
 'totalCash': 48304001024,
 'totalDebt': 132480000000,
 'totalRevenue': 394328014848,
 'totalCashPerShare': 3.036,
 'financialCurrency': 'USD',
 'revenuePerShare': 24.317,
 'quickRatio': 0.709,
 'recommendationMean': 2,
 'exchange': 'NMS',
 'shortName': 'Apple Inc.',
 'longName': 'Apple Inc.',
 'exchangeTimezoneName': 'America/New_York',
 'exchangeTimezoneShortName': 'EST',
 'isEsgPopulated': False,
 'gmtOffSetMilliseconds': '-18000000',
 'quoteType': 'EQUITY',
 'symbol': 'AAPL',
 'messageBoardId': 'finmb_24937',
 'market': 'us_market',
 'annualHoldingsTurnover': None,
 'enterpriseToRevenue': 6.176,
 'beta3Year': None,
 'enterpriseToEbitda': 18.657,
 '52WeekChange': -0.105915785,
 'morningStarRiskRating': None,
 'forwardEps': 6.81,
 'revenueQuarterlyGrowth': None,
 'sharesOutstanding': 15908100096,
 'fundInceptionDate': None,
 'annualReportExpenseRatio': None,
 'totalAssets': None,
 'bookValue': 3.178,
 'sharesShort': 114431895,
 'sharesPercentSharesOut': 0.0072000003,
 'fundFamily': None,
 'lastFiscalYearEnd': 1663977600,
 'heldPercentInstitutions': 0.59976,
 'netIncomeToCommon': 99802996736,
 'trailingEps': 6.11,
 'lastDividendValue': 0.23,
 'SandP52WeekChange': -0.11324203,
 'priceToBook': 46.510384,
 'heldPercentInsiders': 0.00071999995,
 'nextFiscalYearEnd': 1727136000,
 'yield': None,
 'mostRecentQuarter': 1663977600,
 'shortRatio': 1.23,
 'sharesShortPreviousMonthDate': 1665705600,
 'floatShares': 15891414476,
 'beta': 1.246644,
 'enterpriseValue': 2435555000320,
 'priceHint': 2,
 'threeYearAverageReturn': None,
 'lastSplitDate': 1598832000,
 'lastSplitFactor': '4:1',
 'legalType': None,
 'lastDividendDate': 1667520000,
 'morningStarOverallRating': None,
 'earningsQuarterlyGrowth': 0.008,
 'priceToSalesTrailing12Months': 5.9629955,
 'dateShortInterest': 1668470400,
 'pegRatio': 2.67,
 'ytdReturn': None,
 'forwardPE': 21.704845,
 'lastCapGain': None,
 'shortPercentOfFloat': 0.0072000003,
 'sharesShortPriorMonth': 108664761,
 'impliedSharesOutstanding': 0,
 'category': None,
 'fiveYearAverageReturn': None,
 'previousClose': 148.31,
 'regularMarketOpen': 145.96,
 'twoHundredDayAverage': 153.96796,
 'trailingAnnualDividendYield': 0.0060683703,
 'payoutRatio': 0.14729999,
 'volume24Hr': None,
 'regularMarketDayHigh': 148,
 'navPrice': None,
 'averageDailyVolume10Day': 67972610,
 'regularMarketPreviousClose': 148.31,
 'fiftyDayAverage': 145.7518,
 'trailingAnnualDividendRate': 0.9,
 'open': 145.96,
 'toCurrency': None,
 'averageVolume10days': 67972610,
 'expireDate': None,
 'algorithm': None,
 'dividendRate': 0.92,
 'exDividendDate': 1667520000,
 'circulatingSupply': None,
 'startDate': None,
 'regularMarketDayLow': 145.6501,
 'currency': 'USD',
 'trailingPE': 24.191488,
 'regularMarketVolume': 62231328,
 'lastMarket': None,
 'maxSupply': None,
 'openInterest': None,
 'marketCap': 2351376105472,
 'volumeAllCurrencies': None,
 'strikePrice': None,
 'averageVolume': 89886171,
 'dayLow': 145.6501,
 'ask': 147.46,
 'askSize': 1400,
 'volume': 62231328,
 'fiftyTwoWeekHigh': 182.94,
 'fromCurrency': None,
 'fiveYearAvgDividendYield': 0.99,
 'fiftyTwoWeekLow': 129.04,
 'bid': 147.5,
 'tradeable': False,
 'dividendYield': 0.0062,
 'bidSize': 1100,
 'dayHigh': 148,
 'coinMarketCapLink': None,
 'regularMarketPrice': 147.81,
 'preMarketPrice': None,
 'logo_url': 'https://logo.clearbit.com/apple.com',
 'trailingPegRatio': None}
# Technology 10
technology_tickers = ["AAPL","MSFT","NVDA","TSM","ORCL","ASML","AVGO","CSCO","ACN","IBM"]
technology_dict = {}
for t in technology_tickers:
    technology_dict[t] = yf.download(tickers=t, period='5y')
    technology_dict[t].drop(columns=["Adj Close"], inplace = True)
    technology_dict[t].insert(0, "Company",t)
    technology_dict[t].insert(1, "Sector", "Technology")
[*********************100%***********************]  1 of 1 completed

[*********************100%***********************]  1 of 1 completed

[*********************100%***********************]  1 of 1 completed

[*********************100%***********************]  1 of 1 completed

[*********************100%***********************]  1 of 1 completed

[*********************100%***********************]  1 of 1 completed

[*********************100%***********************]  1 of 1 completed

[*********************100%***********************]  1 of 1 completed

[*********************100%***********************]  1 of 1 completed

[*********************100%***********************]  1 of 1 completed

technology_dict["AAPL"]
Company Sector Open High Low Close Volume
Date
2017-12-04 AAPL Technology 43.119999 43.154999 42.407501 42.450001 130169600
2017-12-05 AAPL Technology 42.264999 42.880001 42.099998 42.410000 109400800
2017-12-06 AAPL Technology 41.875000 42.549999 41.615002 42.252499 114240000
2017-12-07 AAPL Technology 42.257500 42.610001 42.227501 42.330002 102693200
2017-12-08 AAPL Technology 42.622501 42.750000 42.205002 42.342499 93420800
... ... ... ... ... ... ... ...
2022-11-28 AAPL Technology 145.139999 146.639999 143.380005 144.220001 69246000
2022-11-29 AAPL Technology 144.289993 144.809998 140.350006 141.169998 83763800
2022-11-30 AAPL Technology 141.399994 148.720001 140.550003 148.029999 111224400
2022-12-01 AAPL Technology 148.210007 149.130005 146.610001 148.309998 71250400
2022-12-02 AAPL Technology 145.960007 148.000000 145.649994 147.809998 65421400

1259 rows × 7 columns

# Healthcare 10
healthcare_tickers = ["UNH","JNJ","LLY","PFE","ABBV","MRK","NVO","TMO","DHR","AZN"]
healthcare_dict = {}
for t in healthcare_tickers:
    healthcare_dict[t] = yf.download(tickers=t, period='5y')
    healthcare_dict[t].drop(columns=["Adj Close"], inplace = True)
    healthcare_dict[t].insert(0, "Company",t)
    healthcare_dict[t].insert(1, "Sector", "Healthcare")
[*********************100%***********************]  1 of 1 completed

[*********************100%***********************]  1 of 1 completed

[*********************100%***********************]  1 of 1 completed

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

[*********************100%***********************]  1 of 1 completed

[*********************100%***********************]  1 of 1 completed

[*********************100%***********************]  1 of 1 completed

[*********************100%***********************]  1 of 1 completed

[*********************100%***********************]  1 of 1 completed

healthcare_dict["UNH"]
Company Sector Open High Low Close Volume
Date
2017-12-04 UNH Healthcare 229.259995 231.770004 221.399994 221.419998 5591500
2017-12-05 UNH Healthcare 221.000000 223.720001 219.160004 220.089996 3246300
2017-12-06 UNH Healthcare 221.020004 221.960007 219.029999 219.940002 2397500
2017-12-07 UNH Healthcare 220.589996 220.610001 218.190002 220.149994 2661600
2017-12-08 UNH Healthcare 220.550003 224.330002 220.220001 223.910004 2233300
... ... ... ... ... ... ... ...
2022-11-28 UNH Healthcare 535.830017 539.580017 531.690002 532.270020 2699900
2022-11-29 UNH Healthcare 531.669983 532.969971 523.650024 528.000000 3751000
2022-11-30 UNH Healthcare 529.719971 548.080017 527.479980 547.760010 9688200
2022-12-01 UNH Healthcare 552.359985 553.000000 535.799988 536.909973 2997800
2022-12-02 UNH Healthcare 532.830017 538.609985 532.440002 536.159973 2345700

1259 rows × 7 columns

# Consumer Cyclical 12
cyclical_tickers = ["AMZN","TSLA","HD","MCD","TM","BABA","NKE","LOW","SBUX","ABNB","LULU","EBAY"]
cyclical_dict = {}
for t in cyclical_tickers:
    cyclical_dict[t] = yf.download(tickers=t, period='5y')
    cyclical_dict[t].drop(columns=["Adj Close"], inplace = True)
    cyclical_dict[t].insert(0, "Company",t)
    cyclical_dict[t].insert(1, "Sector", "Consumer Cyclical")
[*********************100%***********************]  1 of 1 completed

[*********************100%***********************]  1 of 1 completed

[*********************100%***********************]  1 of 1 completed

[*********************100%***********************]  1 of 1 completed

[*********************100%***********************]  1 of 1 completed

[*********************100%***********************]  1 of 1 completed

[*********************100%***********************]  1 of 1 completed

[*********************100%***********************]  1 of 1 completed

[*********************100%***********************]  1 of 1 completed

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

[*********************100%***********************]  1 of 1 completed

cyclical_dict["AMZN"]
Company Sector Open High Low Close Volume
Date
2017-12-04 AMZN Consumer Cyclical 58.692501 58.759998 56.400002 56.697498 118638000
2017-12-05 AMZN Consumer Cyclical 56.412998 57.963501 56.237000 57.078499 81596000
2017-12-06 AMZN Consumer Cyclical 56.899502 57.794498 56.804001 57.617500 57066000
2017-12-07 AMZN Consumer Cyclical 57.829498 58.159500 57.549999 57.989498 50232000
2017-12-08 AMZN Consumer Cyclical 58.520000 58.639500 57.855000 58.099998 61002000
... ... ... ... ... ... ... ...
2022-11-28 AMZN Consumer Cyclical 93.930000 96.400002 93.430000 93.949997 74943100
2022-11-29 AMZN Consumer Cyclical 94.040001 94.410004 91.440002 92.419998 65567300
2022-11-30 AMZN Consumer Cyclical 92.470001 96.540001 91.529999 96.540001 102628200
2022-12-01 AMZN Consumer Cyclical 96.989998 97.230003 94.919998 95.500000 68488000
2022-12-02 AMZN Consumer Cyclical 94.480003 95.360001 93.779999 94.129997 72427000

1259 rows × 7 columns

# Industrials 2
industrials_tickers = ["RTX","BA"]
industrials_dict = {}
for t in industrials_tickers:
    industrials_dict[t] = yf.download(tickers=t, period='5y')
    industrials_dict[t].drop(columns=["Adj Close"], inplace = True)
    industrials_dict[t].insert(0, "Company",t)
    industrials_dict[t].insert(1, "Sector", "Industrials")
[*********************100%***********************]  1 of 1 completed

[*********************100%***********************]  1 of 1 completed

industrials_dict["RTX"]
Company Sector Open High Low Close Volume
Date
2017-12-04 RTX Industrials 76.261803 76.733795 75.531784 75.544365 4380714
2017-12-05 RTX Industrials 76.771553 77.111389 75.645058 75.701698 5004238
2017-12-06 RTX Industrials 76.249214 76.400253 75.752045 76.274384 4143159
2017-12-07 RTX Industrials 76.349907 77.300186 76.186279 77.029579 5733589
2017-12-08 RTX Industrials 77.029579 77.463814 76.941475 77.287605 4766523
... ... ... ... ... ... ... ...
2022-11-28 RTX Industrials 97.070000 98.000000 96.459999 96.900002 4662300
2022-11-29 RTX Industrials 96.800003 97.910004 96.610001 97.430000 2916100
2022-11-30 RTX Industrials 97.519997 98.839996 96.040001 98.720001 6189800
2022-12-01 RTX Industrials 99.120003 99.959999 98.470001 99.500000 4084200
2022-12-02 RTX Industrials 98.680000 101.150002 98.680000 101.010002 4338300

1259 rows × 7 columns

# Financial Services 8
financial_tickers = ["JPM","BAC","GS","MS","V","HSBC","C","UBS"]
financial_dict = {}
for t in financial_tickers:
    financial_dict[t] = yf.download(tickers=t, period='5y')
    financial_dict[t].drop(columns=["Adj Close"], inplace = True)
    financial_dict[t].insert(0, "Company",t)
    financial_dict[t].insert(1, "Sector", "Financial Services")
[*********************100%***********************]  1 of 1 completed

[*********************100%***********************]  1 of 1 completed

[*********************100%***********************]  1 of 1 completed

[*********************100%***********************]  1 of 1 completed

[*********************100%***********************]  1 of 1 completed

[*********************100%***********************]  1 of 1 completed

[*********************100%***********************]  1 of 1 completed

[*********************100%***********************]  1 of 1 completed

financial_dict["C"]
Company Sector Open High Low Close Volume
Date
2017-12-04 C Financial Services 77.019997 77.919998 76.620003 77.099998 23151400
2017-12-05 C Financial Services 76.980003 77.279999 76.190002 76.540001 16045900
2017-12-06 C Financial Services 76.239998 76.500000 75.089996 75.440002 16612500
2017-12-07 C Financial Services 74.669998 75.320000 74.260002 74.980003 19173400
2017-12-08 C Financial Services 75.059998 75.720001 74.970001 75.709999 12460900
... ... ... ... ... ... ... ...
2022-11-28 C Financial Services 47.910000 48.189999 46.830002 47.230000 18987100
2022-11-29 C Financial Services 47.230000 47.900002 47.209999 47.570000 12224700
2022-11-30 C Financial Services 47.080002 48.430000 46.189999 48.410000 23990800
2022-12-01 C Financial Services 48.279999 48.669998 47.349998 47.779999 19797600
2022-12-02 C Financial Services 47.279999 47.450001 46.810001 47.240002 12742700

1259 rows × 7 columns

# Communication Services 5
communication_tickers = ["GOOG","META","NFLX","BIDU","DIS"]
communication_dict = {}
for t in communication_tickers:
    communication_dict[t] = yf.download(tickers=t, period='5y')
    communication_dict[t].drop(columns=["Adj Close"], inplace = True)
    communication_dict[t].insert(0, "Company",t)
    communication_dict[t].insert(1, "Sector", "Communication Services")
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

[*********************100%***********************]  1 of 1 completed

[*********************100%***********************]  1 of 1 completed

[*********************100%***********************]  1 of 1 completed

communication_dict["GOOG"]
Company Sector Open High Low Close Volume
Date
2017-12-04 GOOG Communication Services 50.632999 50.805000 49.778500 49.933998 38128000
2017-12-05 GOOG Communication Services 49.797001 51.030499 49.414001 50.257500 41346000
2017-12-06 GOOG Communication Services 50.075001 51.248501 50.056999 50.918999 25440000
2017-12-07 GOOG Communication Services 51.021500 51.712002 50.903549 51.546501 29164000
2017-12-08 GOOG Communication Services 51.874500 52.102501 51.626099 51.852501 25816000
... ... ... ... ... ... ... ...
2022-11-28 GOOG Communication Services 97.199997 97.830002 95.889999 96.250000 19974500
2022-11-29 GOOG Communication Services 96.000000 96.389999 94.389999 95.440002 20220000
2022-11-30 GOOG Communication Services 95.120003 101.449997 94.669998 101.449997 39888100
2022-12-01 GOOG Communication Services 101.400002 102.589996 100.669998 101.279999 21771500
2022-12-02 GOOG Communication Services 99.370003 101.150002 99.169998 100.830002 18812200

1259 rows × 7 columns

# Energy 4
energy_tickers = ["XOM","CVX","SHEL","COP"]
energy_dict = {}
for t in energy_tickers:
    energy_dict[t] = yf.download(tickers=t, period='5y')
    energy_dict[t].drop(columns=["Adj Close"], inplace = True)
    energy_dict[t].insert(0, "Company",t)
    energy_dict[t].insert(1, "Sector", "Energy")
[*********************100%***********************]  1 of 1 completed

[*********************100%***********************]  1 of 1 completed

[*********************100%***********************]  1 of 1 completed

[*********************100%***********************]  1 of 1 completed

energy_dict["SHEL"]
Company Sector Open High Low Close Volume
Date
2017-12-04 SHEL Energy 64.099998 64.470001 63.959999 64.150002 4026783
2017-12-05 SHEL Energy 64.150002 64.269997 63.830002 63.990002 2357710
2017-12-06 SHEL Energy 63.639999 63.849998 63.360001 63.500000 2487625
2017-12-07 SHEL Energy 63.169998 63.590000 63.099998 63.290001 2931379
2017-12-08 SHEL Energy 63.290001 63.520000 63.099899 63.459999 2812618
... ... ... ... ... ... ... ...
2022-11-28 SHEL Energy 56.490002 57.055000 56.255001 56.490002 3806189
2022-11-29 SHEL Energy 57.270000 58.029999 57.270000 57.619999 4289617
2022-11-30 SHEL Energy 58.470001 58.740002 57.856201 58.470001 5377406
2022-12-01 SHEL Energy 58.689999 58.770000 57.910000 58.049999 4084642
2022-12-02 SHEL Energy 57.720001 58.209999 57.485001 57.720001 3510347

1259 rows × 7 columns

data = pd.concat(list(technology_dict.values())
          +list(healthcare_dict.values())
          +list(cyclical_dict.values())
          +list(industrials_dict.values())
          +list(financial_dict.values())
          +list(communication_dict.values())
          +list(energy_dict.values()))
data
Company Sector Open High Low Close Volume
Date
2017-12-04 AAPL Technology 43.119999 43.154999 42.407501 42.450001 130169600
2017-12-05 AAPL Technology 42.264999 42.880001 42.099998 42.410000 109400800
2017-12-06 AAPL Technology 41.875000 42.549999 41.615002 42.252499 114240000
2017-12-07 AAPL Technology 42.257500 42.610001 42.227501 42.330002 102693200
2017-12-08 AAPL Technology 42.622501 42.750000 42.205002 42.342499 93420800
... ... ... ... ... ... ... ...
2022-11-28 COP Energy 122.699997 125.059998 121.239998 123.839996 5648600
2022-11-29 COP Energy 125.820000 126.830002 123.919998 124.250000 4909700
2022-11-30 COP Energy 126.650002 126.680000 122.440002 123.510002 8480700
2022-12-01 COP Energy 124.669998 125.370003 122.889999 123.059998 5285100
2022-12-02 COP Energy 122.120003 123.870003 121.339996 122.180000 4879400

63449 rows × 7 columns

data.isnull().sum()
Company    0
Sector     0
Open       0
High       0
Low        0
Close      0
Volume     0
dtype: int64

Don’t save it if data already exists:

filepath = "../../data/stocks.csv"
if not os.path.exists(filepath):
    data.to_csv("../../data/stocks.csv")