
## R code to pull data for IMAGES georeferencing project, start matching and summarizing linkage, NACS response, etc.
# R version 4.0.2 - 07/16/2020 CRAN mirror
library(DBI) # for function dbConnect
library(tidyverse) # for data manipulation
library(ggplot2) # for plotting
library(stringr) # for data manipulation
library(sf) # for spatial data manipulation
library(tidycensus) # for Census data download
library(fuzzyjoin) # used to access stringdist

# Census API key for Census data download via tidycensus. The literal 000 is a
# redaction placeholder - substitute a personal key before running. See the
# tidycensus package page for details on getting an API key.
census_api_key(000) # Redacted - for individual use by corresponding author. See tidycensus package page for details on getting an API key.
# Prompt interactively for database credentials so they are never stored in
# this script. NOTE: .rs.askForPassword is an RStudio-internal function and
# will not exist when the script is run outside RStudio.
user <-.rs.askForPassword("Please enter your username:")  # Dialog box for username
psswd <-.rs.askForPassword("Please enter your password:") # Dialog box for password


#### Preliminary exploration of IMAGES project raw data ####
# Main CoreLogic/CSB parcel extract for the spec states
georef_data <- read.csv("SpecStates_CoreLogic.csv")

# Updated CoreLogic pulls for Arkansas and Iowa, read in for comparison
georef_data_new_ar <- readxl::read_xlsx("CoreLogic_update_AR.xlsx")
georef_data_new_ia <- readxl::read_xlsx("CoreLogic_update_IA.xlsx")

# Drop the oddly-named first column of the main extract
georef_data <- georef_data[-1]

# The AR/IA updates lack FID_1 and VALUE; add them as empty placeholders so
# all three tables carry an identical set of columns (the values themselves
# are never used)
for (placeholder_col in c("FID_1", "VALUE")) {
  georef_data_new_ar[[placeholder_col]] <- NA
  georef_data_new_ia[[placeholder_col]] <- NA
}

# Reorder the update tables' columns to match the main extract exactly
georef_data_new_ar <- georef_data_new_ar[names(georef_data)]
georef_data_new_ia <- georef_data_new_ia[names(georef_data)]

# Add in AR, removing old AR data since it overlaps completely.
# Subset with %in% rather than -which(...): which() returns integer(0) when
# nothing matches, and df[-integer(0), ] silently drops *every* row. The %in%
# form also keeps rows with NA STATE, matching the original behavior.
georef_data <- georef_data[!(georef_data$STATE %in% "AR"), ]
georef_data <- rbind(georef_data, georef_data_new_ar)

# I will not add in Iowa until we can tell apart CSBs and hand-digitized polygons;
# for now just record which IA NASSGEOIDs already appear in the combined data
overlapping_ia_geoids <- intersect(georef_data$NASSGEOID, georef_data_new_ia$NASSGEOID)

## This detects NASSGEOIDs that should have a calendar year after 1999 after the first 2 digits,
## indicating manual ID - e.g., "552021etc." rather than "551219etc."
temp_manual_status <- str_detect(georef_data$NASSGEOID, "^\\d{2}20\\d{2}")
georef_data$manual <- as.numeric(temp_manual_status)
table(table(georef_data$NASSGEOID)) # Shows how many NASSGEOID's occur 1, 2, etc. times in data
sum(duplicated(georef_data$PARCEL_ID)) # No duplicates (sum of a logical = count of TRUEs)
sum(duplicated(georef_data$NASSGEOID)) # That's a lot of duplicates - multiple parcels in a NASSGEOID

## Get rid of parcels with no NASSGEOID, then drop Montana.
## Logical subsetting replaces the -which() idiom: which() returns integer(0)
## when nothing matches, and df[-integer(0), ] would silently drop every row.
## %in% keeps rows with NA STATE, same as the original ==/which() combination.
georef_data_nageoid <- georef_data[is.na(georef_data$NASSGEOID), ]
georef_data <- georef_data[!is.na(georef_data$NASSGEOID), ]
georef_data <- georef_data[!(georef_data$STATE %in% "MT"), ]

# This table shows the proportion of manual IDs for each county (this is at the parcel level, not the NASSGEOID level).
# prop.table(margin = 1) computes the same row-wise proportions as dividing by
# rowSums(), without building the contingency table twice.
manual_by_fips <- prop.table(table(georef_data$FIPS_CODE, georef_data$manual), margin = 1)

## Fill in missing FIPS codes - this adds state code and any zeroes necessary on county codes (e.g., 39 becomes 039 for counties)
# First strip leading 0's (one or more at the start of the string) from some/all AR county codes
# so the padding below is uniform across states
georef_data$CNTY_CODE <- str_replace(georef_data$CNTY_CODE, "^0+", "")

# str_pad left-pads to 3 characters with "0" ("9" -> "009", "39" -> "039") and
# propagates NA, matching the original nested ifelse() exactly.
# NOTE(review): STATE_CODE is pasted as-is (not padded) - assumes it is
# already correctly formatted; confirm for single-digit state codes.
missing_fips <- paste0(georef_data$STATE_CODE,
                       str_pad(georef_data$CNTY_CODE, width = 3, side = "left", pad = "0"))
georef_data$FIPS_CODE <- ifelse(is.na(georef_data$FIPS_CODE), missing_fips, georef_data$FIPS_CODE) # Checked, this results in 0 missing FIPS codes


# Pull in metro area data (CBSA-to-FIPS crosswalk)
metro_areas <- read.csv("cbsa2fipsxw.csv")

# Build the full 5-digit county FIPS: 2-digit zero-padded state code followed
# by a 3-digit zero-padded county code. str_pad replaces the nested ifelse()
# padding and propagates NA identically.
metro_areas$FULL_FIPS <- paste0(str_pad(metro_areas$fipsstatecode, width = 2, side = "left", pad = "0"),
                                str_pad(metro_areas$fipscountycode, width = 3, side = "left", pad = "0"))

metro_areas <- metro_areas %>% select(FULL_FIPS, metropolitanmicropolitanstatis, centraloutlyingcounty)

# Attach the metro/micro designation and central/outlying status to each parcel
georef_data <- left_join(georef_data, metro_areas, by=c("FIPS_CODE" = "FULL_FIPS"))

# Amish settlement indicator - would be good to add earlier, but here is fine for now
amish_pops <- read.csv("amish_pops.csv")
names(amish_pops)[1] <- "State"

# tidycensus ships a state/county FIPS lookup table; strip the " County"
# suffix so county names line up with the Amish population table, and build
# the full 5-digit FIPS for each county
data("fips_codes")
fips_codes <- fips_codes %>%
  mutate(county_name = str_remove(county, " County"),
         FULL_FIPS = paste0(state_code, county_code)) %>%
  select(state_name, county_name, FULL_FIPS)

amish_pops <- left_join(amish_pops, fips_codes, by=c("State" = "state_name", "County" = "county_name"))
#amish_pops <- amish_pops %>% filter(!is.na(FULL_FIPS))  # Not necessary here as none of the counties in the spec states' CoreLogic were missing - only necessary in non-spec states - ditto with metro areas

# Flag parcels whose county contains an Amish settlement
georef_data$amish_score_settlement <- ifelse(georef_data$FIPS_CODE %in% amish_pops$FULL_FIPS, 1, 0)

#### Set up query and database connection to pull list frame data ####
# Read in state project names - these are the same for most but not all states
state_project_names <- readxl::read_xlsx("//client/P$/ALL/reclink/georef/project names.xlsx")
# Seed frames with the expected columns; the names are overwritten from each
# query result inside the loop, and the dummy first row is dropped afterward.
# The duplicated addrDelivery column mirrors the query output (data.frame()
# auto-renames the second one); it is removed after the loop.
checkdf <- data.frame(stateId=0, poId=0, wholeName=0, operID=0, addrDelivery=0, addrDelivery=0, placeName=0, stateAlpha=0, zip5=0, otherLink=0, activestatusId=0) # Add column names as necessary
checkdf2 <- data.frame(stateId=0, poId=0, wholeName=0, operID=0, addrDelivery=0, addrDelivery=0, placeName=0, stateAlpha=0, zip5=0, otherLink=0, newAddFlag=0, projectName=0)
# NOTE(review): statefipsvec is not assigned until much later in this script
# (the control-data section); confirm it is defined before this loop runs.
for(i in statefipsvec){ # Using state FIPS code vector that was used to define border counties
  print(i)
  ## Current query: pull stateID (FIPS code), poid, name, address, town, state (letters), zip, in this case parcel ID for CoreLogic, active status
  ## for matching state, frame, and records from NASS databases for each state
  # Note: this query is slightly different for each state - stateId varies over the for loop
  temp_query <- 000 ### Removed due to internal NASS database confidentiality

  # Plain <- is sufficient here: at top level <<- and <- both assign globally
  channel <- 000 ### Removed due to internal NASS database confidentiality

  tempdf <- dbGetQuery(channel, temp_query)

  dbDisconnect(channel) # Disconnect
  names(checkdf) <- names(tempdf) # Make sure names match so you can add these states to the list

  checkdf <- rbind(checkdf, tempdf) # Add this state to the data frame (grows in-loop; acceptable for ~a dozen states)
  # This creates a vector of the state project names for the current state
  projnames <- state_project_names %>% filter(as.numeric(Stateid)==as.numeric(i)) %>% select(Projectname) %>% unlist()
  ## Query that includes NML/criteria records
  for(j in projnames){
    # This query varies by state and project name. A 000 placeholder keeps the
    # redacted script parseable (a bare `temp_query2 <-` followed only by a
    # comment was a syntax error), consistent with the redactions above.
    temp_query2 <- 000 ### Removed due to internal NASS database confidentiality

    channel2 <- 000 ### Removed due to internal NASS database confidentiality

    tempdf2 <- dbGetQuery(channel2, temp_query2)

    dbDisconnect(channel2)
    names(checkdf2) <- names(tempdf2) # Make names the same
    checkdf2 <- rbind(checkdf2, tempdf2) # Add current state and project name to data
  } # loop over project names within states
} # loop over states

# Drop the dummy seed rows created before the query loop
checkdf <- checkdf[-1, ]
checkdf2 <- checkdf2[-1, ]

# Drop the duplicated addrDelivery column (position 5) from each frame
checkdf <- checkdf[-5]
checkdf2 <- checkdf2[-5]

# Keep every distinct poid/otherLink combination (plus newAddFlag for the
# criteria-record pull) without duplicates
checkdf <- checkdf %>% distinct(poId, otherLink, .keep_all = TRUE)
checkdf2 <- checkdf2 %>% distinct(poId, otherLink, newAddFlag, .keep_all = TRUE)

# Some records appear twice with newAddFlag populated on only one row: fill it
# within each poid/otherLink pair in both directions ("downup" = down then up,
# equivalent to two successive fills), then de-duplicate again
checkdf2 <- checkdf2 %>%
  group_by(poId, otherLink) %>%
  fill(newAddFlag, .direction = "downup") %>%
  ungroup()
checkdf2 <- checkdf2 %>% distinct(poId, otherLink, newAddFlag, .keep_all = TRUE)

# Combine newAddFlag with list frame data (joins on all shared columns)
checkdf_final <- full_join(checkdf, checkdf2)

#### Add in NACS response data for respondents - acres owned/rented, operator demographics (some, not years farming though), crops, livestock etc.
# Read each state's NACS extract and stack them in a single bind. This
# replaces the dummy-row-then-rbind-in-a-loop pattern, which grew the data
# frame on every iteration and needed the seed row stripped afterward.
checkdf4 <- do.call(rbind, lapply(statefipsvec, function(i) {
  print(i)
  read.csv(paste0("nacs_response_data_cornsoy_spec/newtest", i, ".csv"))
}))

checkdf4$State_poid <- as.character(checkdf4$State_poid)

# Reshape long to wide: one row per State_poid/status code, one column per NACS item code
tempthing <- checkdf4 %>% pivot_wider(id_cols=c("State_poid", "Current_status_code"), names_from=Varname, values_from=Current_value_num, values_fill=NA)
checkdf4 <- tempthing # So much item nonresponse. So much.

# Select certain columns. These correspond to item codes that can be seen on NACS questionnaires, which are available to be viewed at https://www.nass.usda.gov/Surveys/Guide_to_NASS_Surveys/NACS/index.php
checkdf4 <- checkdf4 %>% select('State_poid', 'Current_status_code', 'K111', 'K112', 'K113', 'K900', 'K300', 'K345', 'K400', 'K689', 'K202', 'K939', 'K610', 'K615', 'K630', 'K640', 'K645', 'K675', 'K767', 'K892', 'K756', 'K770', 'K679', 'K839', 'K201', 'K1571', 'K1574', 'K1926', 'K1586', 'K1597', 'K1614', 'K1925', 'K1585', 'K1596', 'K1615', 'K1927', 'K1587', 'K1598', 'K1622', 'K2701', 'K2702', 'K2703', 'K2704', 'K2705', 'K1801', 'K1802', 'K1803', 'K1804', 'K1805', 'K1901', 'K1902', 'K1903', 'K1904', 'K1905', 'K1616', 'K1617', 'K1618', 'K1620', 'K1621')
# Key NACS data to the list frame by the combined state/poid identifier
checkdf_final$State_poid <- paste0(checkdf_final$stateId, checkdf_final$poId)
checkdf_final <- left_join(checkdf_final, checkdf4, by=c("State_poid")) # Merge by state poid

## Data cleaning to remove commas from numeric NACS variables



# Parcels whose NASSGEOID never appears as an otherLink in the list frame pull
georef_data_missing <- georef_data %>% filter(NASSGEOID %in% setdiff(georef_data$NASSGEOID, checkdf_final$otherLink))
table(georef_data_missing$STATE)


## Join list frame data with parcel data - this requires some partial matching as described in the paper, e.g., on nicknames
# Coerce the matching fields to character so string-distance and fuzzy joins
# behave predictably, and build a FULLNAME field ("first last") to compare
# against the list frame's wholeName
georef_data <- georef_data %>%
  mutate(across(c(PARCEL_ID, NASSGEOID, ADDR, STD_ADDR, OWN1_LAST, OWN1_FRST), as.character),
         FULLNAME = paste(OWN1_FRST, OWN1_LAST, sep = " "))
checkdf_final <- checkdf_final %>%
  mutate(across(c(addrDelivery, wholeName), as.character))






# Check which otherLinks are not in the NASSGEOIDS
# In some cases this could be missing CoreLogic data, or in some cases parcel IDs were used as otherLinks
missing_corelogic_geoids <- setdiff(checkdf_final$otherLink, georef_data$NASSGEOID)
# Start with the otherLinks that *are* in the CoreLogic data
checkdf_in_cl <- checkdf_final %>% filter(!(otherLink %in% missing_corelogic_geoids))
georef_data_to_link <- georef_data #%>% filter(STATE_CODE==46)
# full_join keeps unmatched rows from both sides (NA-padded), so parcels with
# no list frame record - and list frame records with no parcel - both survive
georef_data_lk <- full_join(georef_data_to_link, checkdf_in_cl, by=c("NASSGEOID" = "otherLink"))


# Some but not all Missouri otherLinks are parcel IDs - the ones that are not in CoreLogic NASSGEOIDs
checkdf_mo <- checkdf_final %>% filter(otherLink %in% missing_corelogic_geoids & stateId==29)
georef_data_mo <- full_join(georef_data_to_link, checkdf_mo, by=c("PARCEL_ID" = "otherLink"))
# The filter below also discards the NA-padded rows that full_join introduced:
# a row where either condition evaluates to NA is dropped by filter()
georef_data_mo <- georef_data_mo %>% filter(PARCEL_ID %in% missing_corelogic_geoids & (stateId==29|STATE_CODE==29)) # Filter out only Missouri parcels that have otherLinks not in CoreLogic NASSGEOIDs

# There are 9 in Iowa with parcelID as otherLinks, plus there's one in Illinois with a NASSGEOID otherLink that I don't see in the CoreLogic data
checkdf_ia <- checkdf_final %>% filter(otherLink %in% missing_corelogic_geoids & stateId==19)
georef_data_ia <- full_join(georef_data_to_link, checkdf_ia, by=c("PARCEL_ID" = "otherLink"))
georef_data_ia <- georef_data_ia %>% filter(PARCEL_ID %in% missing_corelogic_geoids & (stateId==19|STATE_CODE==19)) # Filter out only Iowa parcels that have otherLinks not in CoreLogic NASSGEOIDs


# Combine data. rbind() requires all three frames to carry identical columns,
# which holds because each came from a full_join of the same pair of tables.
georef_data_lk <- rbind(georef_data_lk, georef_data_mo, georef_data_ia)

# Arrange by NASSGEOID to make checking number of parcels and poids per NASSGEOID easier
georef_data_lk <- georef_data_lk %>% arrange(NASSGEOID)


# Calculate Jaro-Winkler distances between STD_ADDR and ADDR and addrDelivery in CoreLogic and list frame, respectively, and between names.
# method="jw" yields a distance in [0, 1] (0 = identical strings); an NA on
# either side yields an NA distance.
georef_data_lk$STD_ADDR.distjw <- stringdist::stringdist(georef_data_lk$STD_ADDR, georef_data_lk$addrDelivery, method="jw")
georef_data_lk$ADDR.distjw <- stringdist::stringdist(georef_data_lk$ADDR, georef_data_lk$addrDelivery, method="jw")
georef_data_lk$name.distjw <- stringdist::stringdist(paste(georef_data_lk$OWN1_FRST, georef_data_lk$OWN1_LAST, sep= " "), georef_data_lk$wholeName, method="jw")

# Create variables representing any perfect match between address or name, and the closest match (minimum string distance) for each NASSGEOID, and the closest match in name only for each NASSGEOID.
# NOTE(review): min() has no na.rm here, so one NA distance within a NASSGEOID
# group makes that group's minimum (and the comparison) NA - confirm this is
# the intended treatment of missing addresses/names.
georef_data_lk <- georef_data_lk %>%
  mutate(perfect_match=STD_ADDR.distjw==0|ADDR.distjw==0|name.distjw==0) %>%
  group_by(NASSGEOID) %>%
  mutate(potential_match=!perfect_match & (STD_ADDR.distjw==min(STD_ADDR.distjw)|ADDR.distjw==min(ADDR.distjw)|name.distjw==min(name.distjw)), potential_match_nameonly=name.distjw==min(name.distjw)) %>%
  ungroup()

# Create variables to count number of unique parcels and poids per NASSGEOID
# (an NA poId counts as one distinct value)
georef_data_lk <- georef_data_lk %>% group_by(NASSGEOID) %>%
  mutate(n_parcels=length(unique(PARCEL_ID)), n_poids=length(unique(poId))) %>%
  ungroup()

# Create variable of types of cases - 1 parcel 1 poid (1), 1 parcel multiple
# poids (2), multiple parcels 1 poid (3), multiple parcels multiple poids (4).
# The nested ifelse() propagates NA when either count is NA; the terminal 5 is
# unreachable for positive counts and serves as a catch-all.
georef_data_lk$casetype <- ifelse(georef_data_lk$n_parcels==1 & georef_data_lk$n_poids==1, 1, 
                                  ifelse(georef_data_lk$n_parcels==1 & georef_data_lk$n_poids>1, 2,
                                         ifelse(georef_data_lk$n_parcels>1 & georef_data_lk$n_poids==1, 3,
                                                ifelse(georef_data_lk$n_parcels>1 & georef_data_lk$n_poids>1, 4, 5))))

# Create variable to tell whether, for each NASSGEOID, all poids are NA (no linkages or criteria records)
georef_data_lk <- georef_data_lk %>% group_by(NASSGEOID) %>% mutate(all_na_poids=ifelse(all(is.na(poId)), 1, 0)) %>% ungroup()

## Add in a variable for same FULLNAME, different parcels in the same NASSGEOID - this is for situation with multiple parcels, 1 poid with same parcel owner
georef_data_lk <- georef_data_lk %>% group_by(NASSGEOID) %>% mutate(unique_names=length(unique(FULLNAME))) %>% ungroup()

## Add in case 1's and 2's - 1 parcel, 1 poid or 1 parcel, multiple poids.
## Pattern used throughout this section: peel a confidently-matched subset off
## the working set (georef_data_lk), append it to the accepted set
## (georef_data_lk2), and remove it from the working set with anti_join().
## anti_join() with no `by=` matches on all shared columns.
georef_1parcel <- georef_data_lk %>% filter(casetype %in% c(1,2))

georef_data_lk <- anti_join(georef_data_lk, georef_1parcel)

## Add in case 3's where the only poid is NA
# all_na_poids is numeric 1/0; the ==TRUE comparison works because 1 == TRUE
georef_no_poids <- georef_data_lk %>% filter(all_na_poids==TRUE)

georef_data_lk <- anti_join(georef_data_lk, georef_no_poids)

## Filter out perfect matches and remove the rows with the same NASSGEOID and poid from the data
georef_perfect_match <- georef_data_lk %>% filter(perfect_match==TRUE)

georef_data_lk <- anti_join(georef_data_lk, georef_perfect_match)

## At this point get rid of any parcel IDs that you already covered.
# Accepted set so far: single-parcel cases, no-poid cases, and perfect matches
georef_data_lk2 <- rbind(georef_1parcel, georef_no_poids, georef_perfect_match)

## Include parcels where n_parcels==2, n_poids==2 and you already have one match covered!

georef_data_match2 <- georef_data_lk %>% filter(n_parcels==2 & n_poids==2 & !(poId %in% georef_perfect_match$poId) & NASSGEOID %in% georef_data_lk2$NASSGEOID)

# Of the remaining candidate pairings in each NASSGEOID, keep the closest name
# match; min_diff is a temporary flag dropped after the filter
georef_data_match2 <- georef_data_match2 %>% group_by(NASSGEOID) %>%
  mutate(min_diff=name.distjw==min(name.distjw)) %>%
  ungroup() %>%
  filter(min_diff==TRUE) %>%
  select(-min_diff)

georef_data_lk2 <- rbind(georef_data_lk2, georef_data_match2)

georef_data_lk <- anti_join(georef_data_lk, georef_data_match2)

## Include name matches with string dist jw <= 0.1 (closeness threshold used
## for the matching described in the paper)
georef_data_namejw_0.1 <- georef_data_lk %>% filter(name.distjw<=0.1)
georef_data_lk2 <- rbind(georef_data_lk2, georef_data_namejw_0.1)

georef_data_lk <- anti_join(georef_data_lk, georef_data_namejw_0.1)

## Potential matches on name only - how close is this?
# Make sure there are no first-name-only matches and no matches on a blank
# last name in CoreLogic. OWN1_LAST is wrapped in fixed() so it is treated as
# a literal string: the original passed it straight to str_detect() as a
# regex, so a last name containing a metacharacter (e.g. a parenthesis or
# period) could error or silently match the wrong records.
georef_data_name_pmatch <- georef_data_lk %>% filter(potential_match_nameonly==TRUE & str_detect(wholeName, fixed(OWN1_LAST)) & str_length(OWN1_LAST)>1)

georef_data_lk2 <- rbind(georef_data_lk2, georef_data_name_pmatch)

georef_data_lk <- anti_join(georef_data_lk, georef_data_name_pmatch)

## Add in the single-landowner, multiple-parcels, one-poid situation - things like trusts
georef_data_1owner_1poid <- georef_data_lk %>% filter(unique_names == 1, n_poids == 1)

georef_data_lk2 <- bind_rows(georef_data_lk2, georef_data_1owner_1poid)

georef_data_lk <- anti_join(georef_data_lk, georef_data_1owner_1poid)

# poids that never made it into the accepted (matched) set
still_missing_poids <- setdiff(checkdf_final$poId, georef_data_lk2$poId)

# Accepted rows that carry no poid at all (no linkage found)
georef_na_poid <- georef_data_lk2 %>% filter(is.na(poId))
table(georef_na_poid$STATE) # Almost identical to parcels with no linkages to the list frame that we can find

## EDA of data linkage results, assuming they are correct for now
## First add n_parcels for georef_data_to_link
georef_data_to_link <- georef_data_to_link %>% group_by(NASSGEOID) %>%
  add_count(name="n_parcels") %>%
  ungroup()

# Flag, per NASSGEOID/parcel/poid group, whether any associated row is a
# criteria record (newAddFlag == 1), then classify linkage status.
# any(newAddFlag %in% 1) replaces max(newAddFlag, na.rm=T)==1, which emitted a
# "no non-missing arguments" warning and returned -Inf whenever the whole
# group was NA; the resulting logical is identical (FALSE in that case), so
# the redundant `criteria` guard column is kept only for downstream use.
georef_data_lk2 <- georef_data_lk2 %>%
  group_by(NASSGEOID, PARCEL_ID, State_poid) %>%
  mutate(any_criteria=any(newAddFlag %in% 1)) %>%
  ungroup() %>%
  mutate(criteria=any_criteria %in% TRUE) %>%
  mutate(linkage_status=ifelse(criteria, "Criteria", ifelse(is.na(poId), "No NACS", "Linked")))

table(georef_data_lk2$linkage_status)
# Wow that's low coverage! But which criteria records are in-scope?
georef_data_lk2$linkage_maybe <- ifelse(georef_data_lk2$linkage_status=="Linked", "Linked", "Not linked")

# Add in NACS response data
table(georef_data_lk2$linkage_status, is.na(georef_data_lk2$Current_status_code))
# Check overlap between NACS state poids and list frame state poids.
# NOTE(review): the original referenced checkdf3, which is never defined in
# this script; checkdf4 (the NACS response pull keyed by State_poid) appears
# to be the intended table - confirm against the full project code.
length(setdiff(georef_data_lk2$State_poid, checkdf4$State_poid))
length(intersect(georef_data_lk2$State_poid, checkdf4$State_poid))
length(setdiff(checkdf4$State_poid, georef_data_lk2$State_poid))
# ^ This may be everyone that got a NACS survey in the spec states, not just the CoreLogic people
# ^ This may be everyone that got a NACS survey in the spec states, not just the CoreLogic people

## First some very basic EDA
# Get rid of parcels with missing NASSGEOID, drop Iowa, and (option for now)
# drop rows with no poid - all in one pipeline. Note STATE != "IA" also drops
# rows with NA STATE, exactly as the original sequential filters did.
georef_data_lk2 <- georef_data_lk2 %>%
  filter(!is.na(NASSGEOID)) %>%
  filter(STATE != "IA") %>%
  filter(!is.na(poId))

# ^ Interesting! Keep in the 47 IA poids that have parcels in other states? This is an interesting thing here!
georef_data_lk2$missing_nacs_code <- is.na(georef_data_lk2$Current_status_code)

## What percent of parcels, poids are covered by this analysis
n_distinct(georef_data_lk2$PARCEL_ID) / n_distinct(georef_data$PARCEL_ID)
n_distinct(georef_data_lk2$State_poid) / n_distinct(checkdf_final$State_poid)

##Parcel-level analysis - this is not used for analysis, see "georef_clean_unique_poids" below.
# For each NASSGEOID/parcel, collapse its (possibly multiple) linked records
# into any_* indicator flags and mean_* summaries; then compute response and
# scope flags at the poid level. K-codes are NACS questionnaire item codes
# (see the NASS NACS survey guide linked earlier). The %in% comparisons are
# NA-safe (NA %in% c(1) is FALSE); by contrast any(K...>0) without na.rm can
# return NA when a group is all-NA, so those any_* flags can be NA.
georef_data_lk2 <- georef_data_lk2 %>% group_by(NASSGEOID, PARCEL_ID) %>% 
  mutate(any_linkage=ifelse(any(linkage_status %in% c("Linked")), 1, 0),
         any_nacs=ifelse(any(!is.na(Current_status_code)), 1, 0),
         any_response=ifelse(any(Current_status_code>=50, na.rm=T), 1, 0),
         in_scope=ifelse(any(Current_status_code %in% c(61)), 1, 0),
         # Demographic flags draw from the four operator slots on the form
         any_women=ifelse(any(K1926 %in% c(2) | K1586 %in% c(2) | K1597 %in% c(2) | K1614 %in% c(2)), 1, 0),
         any_hispanic_latino=ifelse(any(K1927 %in% c(1) | K1587 %in% c(1) | K1598 %in% c(1) | K1622 %in% c(1)), 1, 0),
         any_black=ifelse(any(K2702 %in% c(1) | K1802 %in% c(1) | K1902 %in% c(1) | K1617 %in% c(1)), 1, 0),
         any_aian=ifelse(any(K2703 %in% c(1) | K1803 %in% c(1) | K1903 %in% c(1) | K1618 %in% c(1)), 1, 0),
         any_asian=ifelse(any(K2705 %in% c(1) | K1805 %in% c(1) | K1905 %in% c(1) | K1620 %in% c(1)), 1, 0),
         any_nhpi=ifelse(any(K2704 %in% c(1) | K1804 %in% c(1) | K1904 %in% c(1) | K1621 %in% c(1)), 1, 0),
         # Commodity flags
         any_fruit=ifelse(any(K400>0), 1, 0),
         any_veg=ifelse(any(K689>0), 1, 0),
         any_greenhouse=ifelse(any(K939 %in% c(1)), 1, 0),
         any_dairy=ifelse(any(K615>0), 1, 0),
         any_chickens=ifelse(any(K767>0 | K892>0), 1, 0),
         any_livestock=ifelse(any(K610>0 | K615>0 | K630>0 | K640>0 | K645>0 | K675>0 | K767>0 | K892>0 | K770>0), 1, 0),
         any_sales_none=ifelse(any(K201 %in% c(2)), 1, 0),
         any_sales_under10k=ifelse(any(K201 %in% c(3,4,5,6)), 1, 0),
         # mean over the four operator-age items concatenated across group rows
         mean_age=mean(c(K1925,K1585,K1596,K1615),na.rm=T),
         mean_acres_owned=mean(K111,na.rm=T),
         mean_acres_rented_from=mean(K112,na.rm=T),
         mean_acres_rented_to=mean(K113,na.rm=T),
         mean_acres_operated=mean(K900,na.rm=T)) %>%
  ungroup() %>%
  group_by(State_poid) %>%
  mutate(any_response_poid=ifelse(any(Current_status_code>=50, na.rm=T), 1, 0),
         in_scope_poid=ifelse(any(Current_status_code %in% c(61)), 1, 0)) %>%
  ungroup()


# Before parcel-level analysis, a bit of a poid-level analysis
# This part runs the code and calculates the variable to describe in-scope criteria records, the main part of this analysis
# This code groups the data by poid (so a poid could have multiple parcels) and counts parcels, various NACS responses (indicated by variable names starting with "K"), linkage to the list frame, whether they responded to NACS, whether they were in-scope, etc.
# NOTE(review): the seed frame earlier spells this column "activestatusId",
# but names are overwritten from each query result - confirm the database
# actually returns "activeStatusId" as used below.
georef_clean_unique_poids <- georef_data_lk2 %>%
  group_by(State_poid) %>%
  summarize(active_status=mean(activeStatusId, na.rm=T),
            stateId=mean(stateId, na.rm=T),
            FIPS_CODE=mean(as.numeric(FIPS_CODE), na.rm=T),
            n_parcels=length(unique(PARCEL_ID)),
            n_polygons=length(unique(NASSGEOID)),
            any_manual=ifelse(any(manual %in% c(1)), 1, 0),
            # Demographic flags across the four operator slots (NA-safe via %in%)
            any_women=ifelse(any(K1926 %in% c(2) | K1586 %in% c(2) | K1597 %in% c(2) | K1614 %in% c(2)), 1, 0),
            any_hispanic_latino=ifelse(any(K1927 %in% c(1) | K1587 %in% c(1) | K1598 %in% c(1) | K1622 %in% c(1)), 1, 0),
            any_black=ifelse(any(K2702 %in% c(1) | K1802 %in% c(1) | K1902 %in% c(1) | K1617 %in% c(1)), 1, 0),
            any_aian=ifelse(any(K2703 %in% c(1) | K1803 %in% c(1) | K1903 %in% c(1) | K1618 %in% c(1)), 1, 0),
            any_asian=ifelse(any(K2705 %in% c(1) | K1805 %in% c(1) | K1905 %in% c(1) | K1620 %in% c(1)), 1, 0),
            any_nhpi=ifelse(any(K2704 %in% c(1) | K1804 %in% c(1) | K1904 %in% c(1) | K1621 %in% c(1)), 1, 0),
            # Commodity flags; any(K...>0) without na.rm can yield NA for all-NA groups
            any_fruit=ifelse(any(K400>0), 1, 0),
            any_veg=ifelse(any(K689>0), 1, 0),
            any_greenhouse=ifelse(any(K939 %in% c(1)), 1, 0),
            any_dairy=ifelse(any(K615>0), 1, 0),
            any_chickens=ifelse(any(K767>0 | K892>0), 1, 0),
            any_livestock=ifelse(any(K610>0 | K615>0 | K630>0 | K640>0 | K645>0 | K675>0 | K767>0 | K892>0 | K770>0), 1, 0),
            # Mean head counts per livestock item
            mean_cattle=mean(K610, na.rm=T),
            mean_dairy=mean(K615, na.rm=T),
            mean_hogs=mean(K630, na.rm=T),
            mean_sheep=mean(K640, na.rm=T),
            mean_goats=mean(K645, na.rm=T),
            mean_horses=mean(K675, na.rm=T),
            mean_meat_chickens=mean(K767, na.rm=T),
            mean_layer_chickens=mean(K892, na.rm=T),
            mean_turkeys=mean(K770, na.rm=T),
            any_sales_none=ifelse(any(K201 %in% c(2)), 1, 0),
            any_sales_under10k=ifelse(any(K201 %in% c(3,4,5,6)), 1, 0),
            mean_age=mean(c(K1925,K1585,K1596,K1615),na.rm=T),
            mean_acres_owned=mean(K111,na.rm=T),
            mean_acres_rented_from=mean(K112,na.rm=T),
            mean_acres_rented_to=mean(K113,na.rm=T),
            mean_acres_operated=mean(K900, na.rm=T),
            is_amish=any(amish_score_settlement %in% c(1)),
            is_metro_area=any(!is.na(metropolitanmicropolitanstatis)),
            is_metro_metro=any(metropolitanmicropolitanstatis=="Metropolitan Statistical Area"),
            not_on_list_frame=any(newAddFlag %in% c(1)),
            sent_survey=any(!is.na(Current_status_code)),
            any_response=any(Current_status_code>=50, na.rm=T),
            in_scope=any(Current_status_code %in% c(61)),
            inscope_criteria=sum(newAddFlag %in% c(1) & Current_status_code %in% c(61),na.rm=T)) %>%
  # NOTE(review): acres_csb is not produced by the summarize() above and is
  # not defined anywhere earlier in this script, so this mutate() will fail
  # unless acres_csb already exists in the calling environment - verify where
  # it comes from. The intent appears to be scaling per-polygon CSB acres up
  # to the poid's polygon count - confirm.
  mutate(acres_csb=ifelse(n_polygons==1, acres_csb, acres_csb*n_polygons)) %>%
  ungroup()

# Sanity tables: counts of manual IDs, list-frame status, and survey/response/scope overlap
table(georef_clean_unique_poids$any_manual)
table(georef_clean_unique_poids$not_on_list_frame)
table(georef_clean_unique_poids$sent_survey)
table(georef_clean_unique_poids$sent_survey, georef_clean_unique_poids$any_response)
table(georef_clean_unique_poids$any_response, georef_clean_unique_poids$in_scope)


# Three-way tables below: index [2,2,2] extracts the cell where the
# characteristic flag, in_scope, and not_on_list_frame are all at their second
# level (1/TRUE) - i.e., the count of new in-scope farms with that
# characteristic. NA-valued flags are silently dropped by table().
table(georef_clean_unique_poids$any_manual, georef_clean_unique_poids$in_scope, georef_clean_unique_poids$not_on_list_frame)[2,2,2]
table(georef_clean_unique_poids$is_amish, georef_clean_unique_poids$in_scope, georef_clean_unique_poids$not_on_list_frame)[2,2,2]
table(georef_clean_unique_poids$is_metro_area, georef_clean_unique_poids$in_scope, georef_clean_unique_poids$not_on_list_frame)[2,2,2]
table(georef_clean_unique_poids$is_metro_metro, georef_clean_unique_poids$in_scope, georef_clean_unique_poids$not_on_list_frame)[2,2,2]
table(georef_clean_unique_poids$any_sales_none, georef_clean_unique_poids$in_scope, georef_clean_unique_poids$not_on_list_frame)[2,2,2]
table(georef_clean_unique_poids$any_sales_under10k, georef_clean_unique_poids$in_scope, georef_clean_unique_poids$not_on_list_frame)[2,2,2]
# Mean/median acreage among in-scope, not-on-list-frame poids ([2,2] = both TRUE)
tapply(georef_clean_unique_poids$mean_acres_owned, list(georef_clean_unique_poids$in_scope, georef_clean_unique_poids$not_on_list_frame), function(x){mean(x,na.rm=T)})[2,2]
tapply(georef_clean_unique_poids$mean_acres_owned, list(georef_clean_unique_poids$in_scope, georef_clean_unique_poids$not_on_list_frame), function(x){median(x,na.rm=T)})[2,2]
tapply(georef_clean_unique_poids$mean_acres_rented_from, list(georef_clean_unique_poids$in_scope, georef_clean_unique_poids$not_on_list_frame), function(x){mean(x,na.rm=T)})[2,2]
tapply(georef_clean_unique_poids$mean_acres_rented_from, list(georef_clean_unique_poids$in_scope, georef_clean_unique_poids$not_on_list_frame), function(x){median(x,na.rm=T)})[2,2]
tapply(georef_clean_unique_poids$mean_acres_rented_to, list(georef_clean_unique_poids$in_scope, georef_clean_unique_poids$not_on_list_frame), function(x){mean(x,na.rm=T)})[2,2]
tapply(georef_clean_unique_poids$mean_acres_rented_to, list(georef_clean_unique_poids$in_scope, georef_clean_unique_poids$not_on_list_frame), function(x){median(x,na.rm=T)})[2,2]
tapply(georef_clean_unique_poids$mean_acres_operated, list(georef_clean_unique_poids$in_scope, georef_clean_unique_poids$not_on_list_frame), function(x){mean(x,na.rm=T)})[2,2]
tapply(georef_clean_unique_poids$mean_acres_operated, list(georef_clean_unique_poids$in_scope, georef_clean_unique_poids$not_on_list_frame), function(x){median(x,na.rm=T)})[2,2]

# Commodity characteristics among new in-scope farms
table(georef_clean_unique_poids$any_fruit, georef_clean_unique_poids$in_scope, georef_clean_unique_poids$not_on_list_frame)[2,2,2]
table(georef_clean_unique_poids$any_veg, georef_clean_unique_poids$in_scope, georef_clean_unique_poids$not_on_list_frame)[2,2,2]
table(georef_clean_unique_poids$any_greenhouse, georef_clean_unique_poids$in_scope, georef_clean_unique_poids$not_on_list_frame)[2,2,2]
table(georef_clean_unique_poids$any_livestock, georef_clean_unique_poids$in_scope, georef_clean_unique_poids$not_on_list_frame)[2,2,2]
table(georef_clean_unique_poids$any_dairy, georef_clean_unique_poids$in_scope, georef_clean_unique_poids$not_on_list_frame)[2,2,2]
table(georef_clean_unique_poids$any_chickens, georef_clean_unique_poids$in_scope, georef_clean_unique_poids$not_on_list_frame)[2,2,2]


# Operator demographics among new in-scope farms
table(georef_clean_unique_poids$any_women, georef_clean_unique_poids$in_scope, georef_clean_unique_poids$not_on_list_frame)[2,2,2]
table(georef_clean_unique_poids$any_hispanic_latino, georef_clean_unique_poids$in_scope, georef_clean_unique_poids$not_on_list_frame)[2,2,2]
table(georef_clean_unique_poids$any_black, georef_clean_unique_poids$in_scope, georef_clean_unique_poids$not_on_list_frame)[2,2,2]
table(georef_clean_unique_poids$any_aian, georef_clean_unique_poids$in_scope, georef_clean_unique_poids$not_on_list_frame)[2,2,2]
table(georef_clean_unique_poids$any_asian, georef_clean_unique_poids$in_scope, georef_clean_unique_poids$not_on_list_frame)[2,2,2]
table(georef_clean_unique_poids$any_nhpi, georef_clean_unique_poids$in_scope, georef_clean_unique_poids$not_on_list_frame)[2,2,2]

# Same breakdown by state ([,,2,2] keeps the full state x manual margin)
table(georef_clean_unique_poids$stateId, georef_clean_unique_poids$any_manual, georef_clean_unique_poids$in_scope, georef_clean_unique_poids$not_on_list_frame)[,,2,2]

# Mean numbers of livestock
georef_poids_new_farms <- georef_clean_unique_poids %>% filter(in_scope==1 & not_on_list_frame==1)
georef_poids_linked <- georef_clean_unique_poids %>% filter(not_on_list_frame==0)
# Alternative approach: mean/median head counts among new in-scope farms
tapply(georef_clean_unique_poids$mean_cattle, list(georef_clean_unique_poids$in_scope, georef_clean_unique_poids$not_on_list_frame), function(x){mean(x,na.rm=T)})[2,2]
tapply(georef_clean_unique_poids$mean_cattle, list(georef_clean_unique_poids$in_scope, georef_clean_unique_poids$not_on_list_frame), function(x){median(x,na.rm=T)})[2,2]
tapply(georef_clean_unique_poids$mean_dairy, list(georef_clean_unique_poids$in_scope, georef_clean_unique_poids$not_on_list_frame), function(x){mean(x,na.rm=T)})[2,2]
tapply(georef_clean_unique_poids$mean_dairy, list(georef_clean_unique_poids$in_scope, georef_clean_unique_poids$not_on_list_frame), function(x){median(x,na.rm=T)})[2,2]
tapply(georef_clean_unique_poids$mean_hogs, list(georef_clean_unique_poids$in_scope, georef_clean_unique_poids$not_on_list_frame), function(x){mean(x,na.rm=T)})[2,2]
tapply(georef_clean_unique_poids$mean_hogs, list(georef_clean_unique_poids$in_scope, georef_clean_unique_poids$not_on_list_frame), function(x){median(x,na.rm=T)})[2,2]
tapply(georef_clean_unique_poids$mean_sheep, list(georef_clean_unique_poids$in_scope, georef_clean_unique_poids$not_on_list_frame), function(x){mean(x,na.rm=T)})[2,2]
tapply(georef_clean_unique_poids$mean_sheep, list(georef_clean_unique_poids$in_scope, georef_clean_unique_poids$not_on_list_frame), function(x){median(x,na.rm=T)})[2,2]
tapply(georef_clean_unique_poids$mean_goats, list(georef_clean_unique_poids$in_scope, georef_clean_unique_poids$not_on_list_frame), function(x){mean(x,na.rm=T)})[2,2]
tapply(georef_clean_unique_poids$mean_goats, list(georef_clean_unique_poids$in_scope, georef_clean_unique_poids$not_on_list_frame), function(x){median(x,na.rm=T)})[2,2]
tapply(georef_clean_unique_poids$mean_horses, list(georef_clean_unique_poids$in_scope, georef_clean_unique_poids$not_on_list_frame), function(x){mean(x,na.rm=T)})[2,2]
tapply(georef_clean_unique_poids$mean_horses, list(georef_clean_unique_poids$in_scope, georef_clean_unique_poids$not_on_list_frame), function(x){median(x,na.rm=T)})[2,2]
tapply(georef_clean_unique_poids$mean_meat_chickens, list(georef_clean_unique_poids$in_scope, georef_clean_unique_poids$not_on_list_frame), function(x){mean(x,na.rm=T)})[2,2]
tapply(georef_clean_unique_poids$mean_meat_chickens, list(georef_clean_unique_poids$in_scope, georef_clean_unique_poids$not_on_list_frame), function(x){median(x,na.rm=T)})[2,2]
tapply(georef_clean_unique_poids$mean_layer_chickens, list(georef_clean_unique_poids$in_scope, georef_clean_unique_poids$not_on_list_frame), function(x){mean(x,na.rm=T)})[2,2]
tapply(georef_clean_unique_poids$mean_layer_chickens, list(georef_clean_unique_poids$in_scope, georef_clean_unique_poids$not_on_list_frame), function(x){median(x,na.rm=T)})[2,2]
# ^ So that is interesting. The first non-zero median number of livestock I've seen. OK. So yeah lots of backyard egg layers
tapply(georef_clean_unique_poids$mean_turkeys, list(georef_clean_unique_poids$in_scope, georef_clean_unique_poids$not_on_list_frame), function(x){mean(x,na.rm=T)})[2,2]
tapply(georef_clean_unique_poids$mean_turkeys, list(georef_clean_unique_poids$in_scope, georef_clean_unique_poids$not_on_list_frame), function(x){median(x,na.rm=T)})[2,2]


# Active status codes relative to other things like on list frame/not, survey sent, response/scope status
table(georef_clean_unique_poids$active_status)
table(georef_clean_unique_poids$sent_survey, georef_clean_unique_poids$active_status)
table(georef_clean_unique_poids$sent_survey, georef_clean_unique_poids$in_scope, georef_clean_unique_poids$active_status)
table(georef_clean_unique_poids$in_scope, georef_clean_unique_poids$active_status)
table(georef_clean_unique_poids$in_scope, georef_clean_unique_poids$active_status, georef_clean_unique_poids$sent_survey)
table(georef_clean_unique_poids$any_response)
table(georef_clean_unique_poids$any_response, georef_clean_unique_poids$active_status)

########
#### Pull in control data for linked farms (farms already on the list frame) ####
# NOTE(review): statefipsvec is (re)defined here but is already used by the
# query loops earlier in the script - confirm the intended execution order.
# Mixing the string "05" with bare numerics coerces the whole vector to character.
statefipsvec <- c("05", 17, 18, 19, 20, 27, 29, 31, 38, 39, 46, 55)
# Seed frame whose names are overwritten by each query result; the dummy first
# row is dropped after the loop (same pattern as the list frame pull above)
checkdf5 <- data.frame(stateId=0, poId=0, wholeName=0, operID=0, addrDelivery=0, addrDelivery=0, placeName=0, stateAlpha=0, zip5=0, otherLink=0, activestatusId=0, raceId=0, sex=0, spanishOriginId=0, controlTypeId=0, cSourceId=0, dtReceived=0, value=0) # Add column names as necessary
for(i in statefipsvec){ # Using state FIPS code vector that was used to define border counties
  print(i)
  temp_query <- 000 ## Redacted due to NASS database confidentiality
  
  channel <<- 000 ## Redacted due to NASS database confidentiality
  tempdf <- dbGetQuery(channel, temp_query)
  
  dbDisconnect(channel) # Disconnect
  names(checkdf5) <- names(tempdf) # Make sure names match so you can add these states to the list
  
  checkdf5 <- rbind(checkdf5, tempdf) # Add this state to the data frame
  
}

checkdf5 <- checkdf5[-1,] # Drop the dummy seed row
checkdf5 <- checkdf5[,-5] # Drop the duplicated addrDelivery column
checkdf5 <- checkdf5 %>% select(-cSourceId, -dtReceived)
# Reshape data to make control data into columns (one ct_* column per controlTypeId), then merge
checkdf5_wide <- checkdf5 %>% pivot_wider(names_from=c(controlTypeId), values_from=value, names_prefix=c("ct_"))
# For now get rid of date received and cSourceId
#checkdf5_wide <- checkdf5_wide %>% distinct(stateId, poId, wholeName, .keep_all = TRUE)

checkdf5_wide <- checkdf5_wide %>% select(-wholeName, -addrDelivery, -placeName, -stateAlpha, -zip5, -activeStatusId)
# NOTE(review): this State_poid applies as.numeric() to stateId, while the
# earlier construction (checkdf_final) pastes stateId as-is; if stateId is
# ever zero-padded text the two keys will not line up - verify. Also confirm
# that stateId has a compatible type on both sides of the join below.
checkdf5_wide$State_poid <- paste0(as.numeric(checkdf5_wide$stateId), checkdf5_wide$poId)
georef_clean_unique_poids <- left_join(georef_clean_unique_poids, checkdf5_wide, by=c("stateId", "State_poid"))
georef_clean_unique_poids <- georef_clean_unique_poids %>% distinct(State_poid, .keep_all=TRUE)
# Some analysis of demographic and control data
# Make sure you limit this to those on the list frame already
# Derive any_* indicator flags from the control-data (ct_) columns.
# NOTE(review): ifelse(x > 0 | ..., 1, 0) yields NA (not 0) whenever every
# input in the condition is NA, and table() below silently drops those rows -
# confirm that is acceptable. The original ct_acres_owned and
# ct_acres_operated self-assignments were no-ops and have been removed; those
# columns pass through unchanged.
georef_clean_unique_poids <- georef_clean_unique_poids %>%
  mutate(
    ct_any_chickens=ifelse(ct_broilers>0 | ct_fryers>0, 1, 0),
    ct_any_livestock=ifelse(ct_cattle>0 | ct_milk>0 | ct_hogs>0 | ct_sheep>0 | ct_goats>0 | ct_horses>0 | ct_broilers>0 | ct_fryers>0 | ct_turkeys>0, 1, 0),
    ct_any_sales_none=ifelse(ct_nacs_vos %in% c(2), 1, 0),
    ct_any_sales_under10k=ifelse(ct_nacs_vos %in% c(3,4,5,6), 1, 0)
  )


# Cross-tabulate control-data characteristics against list frame status
table(georef_clean_unique_poids$ct_any_sales_none, georef_clean_unique_poids$not_on_list_frame)
table(georef_clean_unique_poids$ct_any_sales_under10k, georef_clean_unique_poids$not_on_list_frame)
tapply(georef_clean_unique_poids$ct_acres_owned, georef_clean_unique_poids$not_on_list_frame, function(x){median(x,na.rm=T)})
tapply(georef_clean_unique_poids$ct_acres_operated, georef_clean_unique_poids$not_on_list_frame, function(x){median(x,na.rm=T)})
table(georef_clean_unique_poids$ct_any_livestock, georef_clean_unique_poids$not_on_list_frame)
table(georef_clean_unique_poids$ct_any_chickens, georef_clean_unique_poids$not_on_list_frame)