VAERS is the Vaccine Adverse Event Reporting System. This is a SELF reporting system for vaccine injury in the USA. The link below contains a portal where zip files can be downloaded of all adverse events recorded since the introduction of the system in 1990.
The files are CSV’s and are joined using the code in this project. A few preliminary investigations and graphics are shown but the intent of this project is to serve as a starting place for others that want to perform their own analytics with the joined data.
Have fun and let me know of any questions as I can be reached via email. Or if you need the full notebook let me know and I can send it along. The repo needs to be cleaned up so I am not prepared to make it public.
Just some includes and constants.
# Imports
import numpy as np
import pandas as pd
import pyodbc as db
import matplotlib.pyplot as plt
import os
%matplotlib inline
import requests
from requests.auth import HTTPBasicAuth
from io import StringIO
from sodapy import Socrata
from datetime import date
import datetime
import seaborn as sns
import configparser
# Constants
stringVAERSDATA = "VAERSDATA"
stringVAERSSYMPTOMS = "VAERSSYMPTOMS"
stringVAERSVAX = "VAERSVAX"
inputDirectory = '.\RAW_DATA'
reworkedInputs = '.\REWORKED_DATA'
outputDirectory = '.\OUTPUT'
pklInputDirectory = '.\INPUT'
usePickle = 0
This cell takes all the files and concatenates them together based on the key attribute ID. Note multiple entries can exist for the symptoms table for a single AE, hence the outer join. As s todo the data types need to be populated for each of the tables.
# Locals
dfVAERSDATA_Initialized = False
dfVAERSSYMPTOMS_Initialized = False
dfVAERSVAX_Initialized = False
# @TODO Column datatypes - need to finish - getting the following warning when reading in files:
# DtypeWarning: Columns (11,13,15,16,27) have mixed types.Specify dtype option on import or set
# low_memory=False. interactivity=interactivity, compiler=compiler, result=result)
dtype_VAERSDATA = {"VAERS_ID": np.int64, "RECVDATE": np.object, "STATE": np.str, "AGE_YRS": np.float64, \
"CAGE_YR": np.float64, "CAGE_MO": np.float64, "SEX": np.str, "RPT_VDATE": np.object, \
"SYMPTOM_TEXT": np.str, "DIED": np.str, "DATEDIED": np.object, "L_THREAD": np.str, \
"ER_VISIT": np.str}
# Iterate over files
for filename in os.listdir(inputDirectory):
# Create the file name
fullFilename = os.path.join(inputDirectory, filename)
# Ensure its a file
if os.path.isfile(fullFilename):
# Print the file name to see where it breaks - if needed
# print(fullFilename)
# Overall
if(fullFilename.__contains__(stringVAERSDATA)):
# Check if the dataframe has been initialized
if(dfVAERSDATA_Initialized == False):
# Read in the file to this dataframe directly
dfVAERSDATA = pd.read_csv(fullFilename, encoding='cp1252', dtype=dtype_VAERSDATA)
# Set to init
dfVAERSDATA_Initialized = True
else:
# Add the new file to the dataframe that has been init'd
dfVAERSDATA = pd.concat([dfVAERSDATA, pd.read_csv(fullFilename, encoding='cp1252', dtype=dtype_VAERSDATA)], ignore_index=True)
# Symptoms
elif(fullFilename.__contains__(stringVAERSSYMPTOMS)):
# Check if the dataframe has been initialized
if(dfVAERSSYMPTOMS_Initialized == False):
# Read in the file to this dataframe directly
dfVAERSSYMPTOMS = pd.read_csv(fullFilename, encoding='cp1252')
# Set to init
dfVAERSSYMPTOMS_Initialized = True
else:
# Add the new file to the dataframe that has been init'd
dfVAERSSYMPTOMS = pd.concat([dfVAERSSYMPTOMS, pd.read_csv(fullFilename, encoding='cp1252')], ignore_index=True)
# Vax
elif(fullFilename.__contains__(stringVAERSVAX)):
# Check if the dataframe has been initialized
if(dfVAERSVAX_Initialized == False):
# Read in the file to this dataframe directly
dfVAERSVAX = pd.read_csv(fullFilename, encoding='cp1252')
# Set to init
dfVAERSVAX_Initialized = True
else:
# Add the new file to the dataframe that has been init'd
dfVAERSVAX = pd.concat([dfVAERSVAX, pd.read_csv(fullFilename, encoding='cp1252')], ignore_index=True)
# Merge the vax with the symptoms tables have duplicates or can
# have duplicates hence using full outer join. Merging vax and symptoms first.
completeDataFrame = pd.merge(left = dfVAERSVAX, right = dfVAERSSYMPTOMS, \
how = "outer", left_on = ["VAERS_ID"], \
right_on = ["VAERS_ID"])
# Now merge the data frame into the merged frame
completeDataFrame = pd.merge(left = completeDataFrame, right = dfVAERSDATA, \
how = "outer", left_on = ["VAERS_ID"], \
right_on = ["VAERS_ID"])
# Create a pickle
completeDataFrame.to_pickle(os.path.join(outputDirectory, "completeDataFrame.pkl"))
# Performance notes:
# Takes about 100 seconds on Dell Precision 752 with Intel(R)
# Core(TM) i7-6820HQ CPU @ 2.70GHz, 2701 Mhz, 4 Core(s), 8 Logical Processor(s)
A bit of pre-processing to clean up the frame and add age buckets.
# Return age bucket for age passed in
def ageRangeFromAge(inAge):
# Check ranges
if(inAge >= 100):
return "GTE_100"
elif(inAge >= 90):
return "90_99"
elif(inAge >= 80):
return "80_89"
elif(inAge >= 70):
return "70_79"
elif(inAge >= 60):
return "60_69"
elif(inAge >= 50):
return "50_59"
elif(inAge >= 40):
return "40_49"
elif(inAge >= 30):
return "30_39"
elif(inAge >= 20):
return "20_29"
elif(inAge >= 10):
return "10_19"
elif(inAge >= 0):
return "0_9"
else:
return "UNKNOWN"
# Fill in the unknowns with negatives for age
completeDataFrame["AGE_YRS"] = completeDataFrame["AGE_YRS"].fillna(-1)
# Fill in the unknowns with negatives for num days
completeDataFrame["NUMDAYS"] = completeDataFrame["NUMDAYS"].fillna(completeDataFrame["NUMDAYS"].mode())
# Add a feature for age range
completeDataFrame["AGE_RANGE"] = completeDataFrame["AGE_YRS"].apply(ageRangeFromAge)
# Make a copy so we can leave the original untouched
workingVaxFrame = completeDataFrame.copy(deep=True)
This cell uses a free socrata API to pull back the cumulative distribution and administered for each vaccine manufacturer for COVID-19 vax only.
# Taken directly from
# https://dev.socrata.com/foundry/data.cdc.gov/unsk-b7fc
# Unauthenticated client only works with public data sets. Note 'None'
# in place of application token, and no username or password:
client = Socrata("data.cdc.gov", None)
# First 50000 results, returned as JSON from API / converted to Python list of dictionaries by sodapy.
getResults = client.get("unsk-b7fc", limit=50000)
# Convert to pandas DataFrame
administrationDataFrame = pd.DataFrame.from_records(getResults)
# Convert to date time
administrationDataFrame["date"] = pd.to_datetime(administrationDataFrame["date"]).dt.date
# Make a copy so we can leave the original untouched
workingAdministrationDataFrame = administrationDataFrame.copy(deep=True)
# Only return the latest date
todaysAdministered = workingAdministrationDataFrame[workingAdministrationDataFrame.date == workingAdministrationDataFrame.date.max()]
# Print to file
administrationDataFrame.to_csv(os.path.join(outputDirectory, "administrationDataFrame.csv"))
# Check data is present
todaysAdministered.shape
Finally we can calculate the total deaths per manufacturer as well as the total deaths as of the date of this post. These stats are for the COVID vax only.
# Get all the values of the type of vaccine to try and find the covid
covidFrame = workingVaxFrame[workingVaxFrame["VAX_TYPE"] == "COVID19"]
# Get all the deaths per manufacturer
covidDeathsFramePerMfg = (covidFrame[covidFrame["DIED"] == "Y"].drop_duplicates(subset=["VAERS_ID"]))["VAX_MANU"].value_counts().to_frame()
# Convert to a float
covidDeathsFramePerMfg["VAX_MANU"] = covidDeathsFramePerMfg["VAX_MANU"].astype("float")
# Show the total events
print("TOTAL DEATHS:", covidDeathsFramePerMfg["VAX_MANU"].sum().astype("int"))
# Show graphs of deaths per million for each mfg
covidDeathsFramePerMfg["VAX_MANU"].plot(kind = "bar", title = "Total Death Count vs. Manufacturer")
