Stata: Classify Neighbors

cap program drop classifyNeighbors 
program define classifyNeighbors

    args coordinatesFile /// 
         longitudeVar /// 
         latitudeVar /// 
         coordinatesKEEP_STRINGS /// 
         distanceFile /// 
         distanceUnit /// 
         distanceVar ///
         distanceKEEP_STRINGS /// 
         areaFile /// 
         areaUnit /// 
         areaVar /// 
         areaKEEP_STRINGS /// 
         neighbroingLinesFile /// 
         rootDir /// 
         PATH_GIS /// 
         geoid /// 
         idVar /// 
         maxNeighborDegree 
    
    ***********************IMPORTANT NOTE ABOUT FILES***************************
    * The files used for the coordinates, area, distance, and neighboring lines 
    * are allowed to be different files, although they do not have to be, but it 
    * is crucial that they all are related to some parent file such that the 
    * variable "objectid" which is assigned by ArcGIS is the same for all the 
    * spatial units. This means that you cannot use a file with the coordinates 
    * of the centroids of the spatial units that originated from a different 
    * shapefile, and clearly not from a file that does not even contain the 
    * "objectid" variable. 
    ****************************************************************************
    * Arguments Legend 
    * coordinatesFile: file which stores the coordinates of each centroid per 
    * each spatial unit. 
    * longitudeVar: variable with the longitude coordinate 
    * latitudeVar: variable with the latitude coordinate
    * distanceFile: The file which stores the point distance, centroid to  
    * centroid, between your spatial units. 
    * distanceUnit: meters, km, feet, miles, etc.
    * distanceVar: variable name for the distance variable
    * areaFile: The file stores the total area of your spatial unit. Could be 
    * the same file as the distanceFile, but ideally you want to use different 
    * projections (distance preserving, area preserving, etc.) to reduce the 
    * error in each variable. 
    * areaUnit: squared km, squared feet, etc. 
    * areaVar: variable name for the area variable 
    * neighbroingLinesFile: This file stores the data from the XXXXXX tool, from 
    * ArcGIS. 
    * rootDir: path to your library.
    * geoid: variable that identifies the spatial unit in ArcGIS 
    * idVar: variable that identifies the spatial unit from hereafter
    * maxNeighborDegree: how many neighboring degrees should be classified. By 
    * choosing 1 the program will classify only the first degree neighbors, by 
    * choosing 2 it will also classify the neighbor of the neighbor (2nd degree 
    * neighbor), by choosing 3 it will classify the neighbor of the neighbor of 
    * the neighbor (3rd degree neighbor), and so on. 
    * CAUTION: Choosing higher maximum degrees results in very long runtimes. 
    * This program is not very efficient and could take a lot of time (and 
    * require a lot of RAM) to process high degree of neighboring relationship. 

    tempfile coordinatesTable /// 
             distanceTable ///
             areaTable /// 
             neighboringCounties



    * Get coordinates for each county centroid 
    import delimited "`rootDir'/`PATH_GIS'/`coordinatesFile'", /// 
           clear stringcols(`coordinatesKEEP_STRINGS') 

    rename `geoid' `idVar'
    rename objectid gisID
    keep gisID `idVar' `longitudeVar' `latitudeVar'
    rename `longitudeVar' longitude
    rename `latitudeVar' latitude 

    sort gisID  
    save `coordinatesTable', replace


    * Import the distance matrix between county centroids 
    * Use the stringcols option to preserve leading zeros in your id variable
    import delimited "`rootDir'/`PATH_GIS'/`distanceFile'", /// 
           clear stringcols(`distanceKEEP_STRINGS')

    rename `geoid' `idVar'Prime 
    rename `geoid'_1 `idVar'
    * This would be the place to convert, if needed, the distance units.
    label var `distanceVar' "Distance (in `distanceUnit')"

    keep `idVar'Prime `idVar' `distanceVar' 
    sort `idVar'Prime `idVar' 
    save `distanceTable', replace 


    * Import the area of each spatial unit 
    import delimited "`rootDir'/`PATH_GIS'/`areaFile'", /// 
           clear stringcols(`areaKEEP_STRINGS')

    rename `geoid' `idVar'  

    keep `idVar' `areaVar'
    label var `areaVar' "Area (in `areaUnit')"

    sort `idVar' 
    save `areaTable', replace 

    * Now that we have the distance between centroids, and the area of each 
    * spatial unit stored in memory we can continue to classify the neighboring 
    * relationship between the spatial units. We need to prepare one more file. 

    * Import the information about the neighboring lines, the lines that identify 
    * the spatial units that share a border. 
    import delimited "`rootDir'/`PATH_GIS'/`neighbroingLinesFile'", clear 

    keep objectid left_fid right_fid 
    * Drop the lines that are on the edge, meaning that there is no further spatial 
    * unit beyond that line. 
    drop if left_fid < 0 | right_fid < 0
    * No point in having duplicate line sets that identify the same neighboring 
    * spatial unit so we keep only the unique combinations.
    bysort left_fid right_fid: keep if _n == 1
    rename left_fid gisID 
    rename right_fid neighborID
    * We now have the the neighbors to the right of each spatial unit
    sort gisID 
    save `neighboringCounties', replace 

    * Repeating to find the neighbors on the left of each spatial unit 
    import delimited "`rootDir'/`PATH_GIS'/`neighbroingLinesFile'", clear 

    keep objectid left_fid right_fid 
    drop if left_fid < 0 | right_fid < 0
    bysort left_fid right_fid: keep if _n == 1
    rename right_fid gisID 
    rename left_fid neighborID 
    * We now have the neighbors to the left of each spatial unit 
    sort gisID 

    * Combine the neighbors to the left and to the right of each spatial unit by 
    * appending the previous file saved. 
    append using `neighboringCounties'
    keep gisID neighborID

    save `neighboringCounties', replace 


    * We now have all the needed files ready and we can start the classification.
    use `coordinatesTable', clear 

    sort `idVar'
    merge 1:1 `idVar' using `areaTable'
    keep if _merge == 3 
    drop _merge 

    rename `idVar' `idVar'Prime 
    rename longitude longitudePrime
    rename latitude latitudePrime 
    rename `areaVar' areaPrime 

    * Each observation is a spatial unit that serves as an anchor, denoted as PRIME. 
    * Now we start identifying the neighboring units of each PRIME unit.   

    * Match spatial units based on their GIS objectid numbers 
    merge 1:m gisID using `neighboringCounties'
    keep if _merge == 3 
    drop _merge 
    * This step just gave us the matching between each unique prime spatial unit 
    * and all its first degree neighbors. Next we want to add the data about the 
    * coordinates of the first degree neighbors and the distance between the prime 
    * units and the first degree neighbors. 

    * Flag the neighborID as the GIS objectid number (gisID)
    drop gisID
    rename neighborID gisID 
    sort gisID 
    * Match the gisID with the idVar and geolocation data 
    merge m:1 gisID using `coordinatesTable'
    keep if _merge == 3
    drop _merge  
    * Get the distance data 
    sort `idVar'Prime `idVar'
    merge 1:1 `idVar'Prime `idVar' using `distanceTable'
    keep if _merge == 3
    drop _merge
    * Get the area data 
    sort `idVar'
    merge m:1 `idVar' using `areaTable'
    keep if _merge == 3
    drop _merge 

    * Rename 
    rename `idVar' `idVar'Neighbor1
    rename longitude longitudeNeighbor1 
    rename latitude latitudeNeighbor1 
    rename `distanceVar' distanceNeighbor1 
    rename `areaVar' areaNeighbor1 

    * Next we are going to repeat the above process for second degree neighbors and 
    * so on, up to the neighboring degree determined by the maxNeighborDegree 
    * parameter. 

    * This will simply be used to order the variables nicely at the end.
    local sortOrder = ""
    forvalues degreeN = 2(1)`maxNeighborDegree' {
        sort gisID 
        joinby gisID using `neighboringCounties'

        drop gisID
        rename neighborID gisID 
        sort gisID 

        merge m:1 gisID using `coordinatesTable'
        keep if _merge == 3
        drop _merge  

        sort `idVar'Prime `idVar'
        merge m:1 `idVar'Prime `idVar' using `distanceTable'
        drop if _merge == 2
        drop _merge

        sort `idVar' 
        merge m:1 `idVar' using `areaTable'
        keep if _merge == 3 
        drop _merge 

        rename `idVar' `idVar'Neighbor`degreeN'
        rename longitude longitudeNeighbor`degreeN'
        rename latitude latitudeNeighbor`degreeN'
        rename `distanceVar' distanceNeighbor`degreeN'
        rename `areaVar' areaNeighbor`degreeN' 

        local sortOrder = "`sortOrder' " + "`idVar'Neighbor`degreeN'"
    }

    sort `idVar'Prime `idVar'Neighbor1 `sortOrder'

    * Removing duplicates. 
    * This process takes the most amount of time relative to the other pieces and 
    * this is also the part that does not scale well, meaning that computational 
    * time increases exponentially with the number of max neighbor degree. 

    * Counties should not be neighbors of themselves 
    forvalues degreeN = 2(1)`maxNeighborDegree' {
        drop if `idVar'Prime == `idVar'Neighbor`degreeN'
    }

    * Neighboring counties should only be counted once 
    local maxNeighborDegreeMinus1 = `maxNeighborDegree' -1
    qui levelsof `idVar'Prime, local(`idVar'Levels)
    * The following two locals will help generate a progress bar 
    local idN: word count "``idVar'Levels'"
    local k = 1 
    foreach idPrime in ``idVar'Levels' {
      forvalues degreeN = 1(1)`maxNeighborDegreeMinus1' {
        qui levelsof `idVar'Neighbor`degreeN' if `idVar'Prime == "`idPrime'", local(`idVar'N`degreeN')
        local degreeN_Plus_1 = `degreeN' + 1
        foreach idCodeN`degreeN' in ``idVar'N`degreeN'' {
          qui drop if `idVar'Neighbor`degreeN_Plus_1' == "`idCodeN`degreeN''" /// 
                      & `idVar'Prime == "`idPrime'"    
        }    
      }
      * I included a progress so there will be some meaningful output while this runs 
      local PROGRESS = "Progress (Neighbor): " + string(round(`k'/`idN'*100,0.01)) + "%"
      di "`PROGRESS'"
      local k = `k' + 1
    }

    drop gisID

    * Each spatial unit, identified by `idVar'Prime, has its neighbors of degree 1, 
    * identified by `idVar'Neighbor1, and its neighbors of degree 2, identified by 
    * `idVar'Neighbor2; and so on. 
    * 
    * Notice that the number of observations per-spatial unit is determined by the 
    * number of unique spatial units in the the highest neighboring degree, meaning 
    * there are some duplicates left in the lower levels. 


    * This part makes it easier to merge the resulting file based on `idVar'
    gen `idVar' = `idVar'Prime
    sort `idVar'
    save `neighboringCounties', replace 

end

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

cap program drop classifyNeighbors

program define classifyNeighbors

args coordinatesFile ///

longitudeVar ///

latitudeVar ///

coordinatesKEEP_STRINGS ///

distanceFile ///

distanceUnit ///

distanceVar ///

distanceKEEP_STRINGS ///

areaFile ///

areaUnit ///

areaVar ///

areaKEEP_STRINGS ///

neighbroingLinesFile ///

rootDir ///

PATH_GIS ///

geoid ///

idVar ///

maxNeighborDegree

***********************IMPORTANT NOTE ABOUT FILES***************************

* The files used for the coordinates, area, distance, and neighboring lines

* are allowed to be different files, although they do not have to be, but it

* is crucial that they all are related to some parent file such that the

* variable "objectid" which is assigned by ArcGIS is the same for all the

* spatial units. This means that you cannot use a file with the coordinates

* of the centroids of the spatial units that originated from a different

* shapefile, and clearly not from a file that does not even contain the

* "objectid" variable.

****************************************************************************

* Arguments Legend

* coordinatesFile: file which stores the coordinates of each centroid per

* each spatial unit.

* longitudeVar: variable with the longitude coordinate

* latitudeVar: variable with the latitude coordinate

* distanceFile: The file which stores the point distance, centroid to

* centroid, between your spatial units.

* distanceUnit: meters, km, feet, miles, etc.

* distanceVar: variable name for the distance variable

* areaFile: The file stores the total area of your spatial unit. Could be

* the same file as the distanceFile, but ideally you want to use different

* projections (distance preserving, area preserving, etc.) to reduce the

* error in each variable.

* areaUnit: squared km, squared feet, etc.

* areaVar: variable name for the area variable

* neighbroingLinesFile: This file stores the data from the XXXXXX tool, from

* ArcGIS.

* rootDir: path to your library.

* geoid: variable that identifies the spatial unit in ArcGIS

* idVar: variable that identifies the spatial unit from hereafter

* maxNeighborDegree: how many neighboring degrees should be classified. By

* choosing 1 the program will classify only the first degree neighbors, by

* choosing 2 it will also classify the neighbor of the neighbor (2nd degree

* neighbor), by choosing 3 it will classify the neighbor of the neighbor of

* the neighbor (3rd degree neighbor), and so on.

* CAUTION: Choosing higher maximum degrees results in very long runtimes.

* This program is not very efficient and could take a lot of time (and

* require a lot of RAM) to process high degree of neighboring relationship.

tempfile coordinatesTable ///

distanceTable ///

areaTable ///

neighboringCounties

* Get coordinates for each county centroid

import delimited "`rootDir'/`PATH_GIS'/`coordinatesFile'", ///

clear stringcols(`coordinatesKEEP_STRINGS')

rename `geoid' `idVar'

rename objectid gisID

keep gisID `idVar' `longitudeVar' `latitudeVar'

rename `longitudeVar' longitude

rename `latitudeVar' latitude

sort gisID

save `coordinatesTable', replace

* Import the distance matrix between county centroids

* Use the stringcols option to preserve leading zeros in your id variable

import delimited "`rootDir'/`PATH_GIS'/`distanceFile'", ///

clear stringcols(`distanceKEEP_STRINGS')

rename `geoid' `idVar'Prime

rename `geoid'_1 `idVar'

* This would be the place to convert, if needed, the distance units.

label var `distanceVar' "Distance (in `distanceUnit')"

keep `idVar'Prime `idVar' `distanceVar'

sort `idVar'Prime `idVar'

save `distanceTable', replace

* Import the area of each spatial unit

import delimited "`rootDir'/`PATH_GIS'/`areaFile'", ///

clear stringcols(`areaKEEP_STRINGS')

rename `geoid' `idVar'

keep `idVar' `areaVar'

label var `areaVar' "Area (in `areaUnit')"

sort `idVar'

save `areaTable', replace

* Now that we have the distance between centroids, and the area of each

* spatial unit stored in memory we can continue to classify the neighboring

* relationship between the spatial units. We need to prepare one more file.

* Import the information about the neighboring lines, the lines that identify

* the spatial units that share a border.

import delimited "`rootDir'/`PATH_GIS'/`neighbroingLinesFile'", clear

keep objectid left_fid right_fid

* Drop the lines that are on the edge, meaning that there is no further spatial

* unit beyond that line.

drop if left_fid < 0 | right_fid < 0

* No point in having duplicate line sets that identify the same neighboring

* spatial unit so we keep only the unique combinations.

bysort left_fid right_fid: keep if _n == 1

rename left_fid gisID

rename right_fid neighborID

* We now have the the neighbors to the right of each spatial unit

sort gisID

save `neighboringCounties', replace

* Repeating to find the neighbors on the left of each spatial unit

import delimited "`rootDir'/`PATH_GIS'/`neighbroingLinesFile'", clear

keep objectid left_fid right_fid

drop if left_fid < 0 | right_fid < 0

bysort left_fid right_fid: keep if _n == 1

rename right_fid gisID

rename left_fid neighborID

* We now have the neighbors to the left of each spatial unit

sort gisID

* Combine the neighbors to the left and to the right of each spatial unit by

* appending the previous file saved.

append using `neighboringCounties'

keep gisID neighborID

save `neighboringCounties', replace

* We now have all the needed files ready and we can start the classification.

use `coordinatesTable', clear

sort `idVar'

merge 1:1 `idVar' using `areaTable'

keep if _merge == 3

drop _merge

rename `idVar' `idVar'Prime

rename longitude longitudePrime

rename latitude latitudePrime

rename `areaVar' areaPrime

* Each observation is a spatial unit that serves as an anchor, denoted as PRIME.

* Now we start identifying the neighboring units of each PRIME unit.

* Match spatial units based on their GIS objectid numbers

merge 1:m gisID using `neighboringCounties'

keep if _merge == 3

drop _merge

* This step just gave us the matching between each unique prime spatial unit

* and all its first degree neighbors. Next we want to add the data about the

* coordinates of the first degree neighbors and the distance between the prime

* units and the first degree neighbors.

* Flag the neighborID as the GIS objectid number (gisID)

drop gisID

rename neighborID gisID

sort gisID

* Match the gisID with the idVar and geolocation data

merge m:1 gisID using `coordinatesTable'

keep if _merge == 3

drop _merge

* Get the distance data

sort `idVar'Prime `idVar'

merge 1:1 `idVar'Prime `idVar' using `distanceTable'

keep if _merge == 3

drop _merge

* Get the area data

sort `idVar'

merge m:1 `idVar' using `areaTable'

keep if _merge == 3

drop _merge

* Rename

rename `idVar' `idVar'Neighbor1

rename longitude longitudeNeighbor1

rename latitude latitudeNeighbor1

rename `distanceVar' distanceNeighbor1

rename `areaVar' areaNeighbor1

* Next we are going to repeat the above process for second degree neighbors and

* so on, up to the neighboring degree determined by the maxNeighborDegree

* parameter.

* This will simply be used to order the variables nicely at the end.

local sortOrder = ""

forvalues degreeN = 2(1)`maxNeighborDegree' {

sort gisID

joinby gisID using `neighboringCounties'

drop gisID

rename neighborID gisID

sort gisID

merge m:1 gisID using `coordinatesTable'

keep if _merge == 3

drop _merge

sort `idVar'Prime `idVar'

merge m:1 `idVar'Prime `idVar' using `distanceTable'

drop if _merge == 2

drop _merge

sort `idVar'

merge m:1 `idVar' using `areaTable'

keep if _merge == 3

drop _merge

rename `idVar' `idVar'Neighbor`degreeN'

rename longitude longitudeNeighbor`degreeN'

rename latitude latitudeNeighbor`degreeN'

rename `distanceVar' distanceNeighbor`degreeN'

rename `areaVar' areaNeighbor`degreeN'

local sortOrder = "`sortOrder' " + "`idVar'Neighbor`degreeN'"

}

sort `idVar'Prime `idVar'Neighbor1 `sortOrder'

* Removing duplicates.

* This process takes the most amount of time relative to the other pieces and

* this is also the part that does not scale well, meaning that computational

* time increases exponentially with the number of max neighbor degree.

* Counties should not be neighbors of themselves

forvalues degreeN = 2(1)`maxNeighborDegree' {

drop if `idVar'Prime == `idVar'Neighbor`degreeN'

}

* Neighboring counties should only be counted once

local maxNeighborDegreeMinus1 = `maxNeighborDegree' -1

qui levelsof `idVar'Prime, local(`idVar'Levels)

* The following two locals will help generate a progress bar

local idN: word count "``idVar'Levels'"

local k = 1

foreach idPrime in ``idVar'Levels' {

forvalues degreeN = 1(1)`maxNeighborDegreeMinus1' {

qui levelsof `idVar'Neighbor`degreeN' if `idVar'Prime == "`idPrime'", local(`idVar'N`degreeN')

local degreeN_Plus_1 = `degreeN' + 1

foreach idCodeN`degreeN' in ``idVar'N`degreeN'' {

qui drop if `idVar'Neighbor`degreeN_Plus_1' == "`idCodeN`degreeN''" ///

& `idVar'Prime == "`idPrime'"

}

* I included a progress so there will be some meaningful output while this runs

local PROGRESS = "Progress (Neighbor): " + string(round(`k'/`idN'*100,0.01)) + "%"

di "`PROGRESS'"

local k = `k' + 1

}

drop gisID

* Each spatial unit, identified by `idVar'Prime, has its neighbors of degree 1,

* identified by `idVar'Neighbor1, and its neighbors of degree 2, identified by

* `idVar'Neighbor2; and so on.

* Notice that the number of observations per-spatial unit is determined by the

* number of unique spatial units in the the highest neighboring degree, meaning

* there are some duplicates left in the lower levels.

* This part makes it easier to merge the resulting file based on `idVar'

gen `idVar' = `idVar'Prime

sort `idVar'

save `neighboringCounties', replace

end