# Research interests of women and men in psychology
# (original German title: "Forschungsinteressen von Frauen und Männern in der
# Psychologie")
# André Bittermann, Nina Greiner, Andreas Fischer
################################################################

library(quanteda)
library(dplyr)
library(stringr)

load("diss.RData")


### determine author gender ----
# The first names in diss$author12 are sent to the genderize.io API via
# genderizeR. Because of the daily query limit, the lookup is repeated in
# several rounds: each round requests only the names that are still
# unresolved (genderIndicators == 0), appends the answers to the name
# database, and re-applies the enlarged database to all names.

library(genderizeR)

# If you work with free API plan you are limited to 1000 queries a day.
givenNames_part1 <- findGivenNames(diss$author12)
save(givenNames_part1, file = "givenNames_part1.RData")
nrow(givenNames_part1) # see when algorithm stopped

# otherwise, names like "Anna_K" will not be recognized
firstnames <- gsub("_", " ", diss$author12)

# apply to all names
genderized <- genderize(firstnames, genderDB = givenNames_part1,
                        progress = TRUE)
# using givenNames_part1, 13,213 names (73.2 %) could be genderized
table(genderized$genderIndicators)
colnames(genderized)[1] <- "firstnames"
diss_temp <- cbind(diss, genderized) # merge


## round 2 ----
# wait for 24 hours (the Sys.sleep(x) function could be applied)

# Row filter: which() plus the trailing comma selects rows for data.frame and
# data.table inputs alike and ignores NA comparisons. A single logical index
# without the comma would select columns on a plain data.frame.
missing <- diss_temp[which(diss_temp$genderIndicators == 0), ]
# send only requests for missing names
givenNames_part2 <- findGivenNames(missing$author12)
save(givenNames_part2, file = "givenNames_part2.RData")
givenNames <- rbind(givenNames_part1, givenNames_part2)

# apply genderDB 2nd version
genderized2 <- genderize(firstnames, genderDB = givenNames, progress = TRUE)
table(genderized2$genderIndicators)
colnames(genderized2)[1] <- "firstnames"
# NOTE(review): only this round restricts diss to its first 22 columns, all
# other rounds bind the complete diss object - confirm this is intentional.
diss_temp <- cbind(diss[, 1:22], genderized2) # merge
table(diss_temp$gender, diss_temp$PY) # male-female by year


## round 3 ----
# wait for 24 hours

missing <- diss_temp[which(diss_temp$genderIndicators == 0), ]
# send only requests for missing names
givenNames_part3 <- findGivenNames(missing$author12)
save(givenNames_part3, file = "givenNames_part3.RData")
givenNames <- rbind(givenNames_part1, givenNames_part2, givenNames_part3)

# apply genderDB 3rd version
genderized3 <- genderize(firstnames, genderDB = givenNames, progress = TRUE)
table(genderized3$genderIndicators)
colnames(genderized3)[1] <- "firstnames"
diss_temp <- cbind(diss, genderized3) # merge
table(diss_temp$gender, diss_temp$PY) # male-female by year
sum(table(diss_temp$gender))


## round 4 ----
# wait for 24 hours

missing <- diss_temp[which(diss_temp$genderIndicators == 0), ]
# send only requests for missing names
givenNames_part4 <- findGivenNames(missing$author12)
save(givenNames_part4, file = "givenNames_part4.RData")
givenNames <- rbind(givenNames_part1, givenNames_part2, givenNames_part3,
                    givenNames_part4)

# apply genderDB 4th version
genderized4 <- genderize(firstnames, genderDB = givenNames, progress = TRUE)
table(genderized4$genderIndicators)
colnames(genderized4)[1] <- "firstnames"
diss_temp <- cbind(diss, genderized4) # merge
table(diss_temp$gender, diss_temp$PY) # male-female by year
sum(table(diss_temp$gender))


## round 5 ----
# wait for 24 hours

missing <- diss_temp[which(diss_temp$genderIndicators == 0), ]
# send only requests for missing names
givenNames_part5 <- findGivenNames(missing$author12)
save(givenNames_part5, file = "givenNames_part5.RData")
givenNames <- rbind(givenNames_part1, givenNames_part2, givenNames_part3,
                    givenNames_part4, givenNames_part5)

# apply genderDB 5th version
genderized5 <- genderize(firstnames, genderDB = givenNames, progress = TRUE)
table(genderized5$genderIndicators)
colnames(genderized5)[1] <- "firstnames"
diss_temp <- cbind(diss, genderized5) # merge
table(diss_temp$gender, diss_temp$PY) # male-female by year
sum(table(diss_temp$gender))


## round 6 ----
# wait for 24 hours

missing <- diss_temp[which(diss_temp$genderIndicators == 0), ]
# send only requests for missing names
givenNames_part6 <- findGivenNames(missing$author12)
save(givenNames_part6, file = "givenNames_part6.RData")
givenNames <- rbind(givenNames_part1, givenNames_part2, givenNames_part3,
                    givenNames_part4, givenNames_part5, givenNames_part6)
# complete name database after the last round
save(givenNames, file = "givenNames.RData")

# apply genderDB 6th version
genderized6 <- genderize(firstnames, genderDB = givenNames, progress = TRUE)
table(genderized6$genderIndicators)
colnames(genderized6)[1] <- "firstnames"
diss_temp <- cbind(diss, genderized6) # merge
table(diss_temp$gender, diss_temp$PY) # male-female by year
sum(table(diss_temp$gender))


### mark cases for manual checking ----

# uncertain cases: names the API classified with probability <= 0.95
uncertain <- givenNames[as.numeric(givenNames$probability) <= 0.95, ]
colnames(uncertain)[1] <- "givenName"

# unisex first names, list created from own research
unisex <- read.csv("unisex_vornamen.txt", header = FALSE,
                   stringsAsFactors = FALSE)
unisex$unisex <- 1
colnames(unisex)[1] <- "givenName"
unisex[, 1] <- quanteda::char_tolower(unisex[, 1])

# subset of uncertain cases and unisex names in diss dataset
# (columns 1, 2, 4 of the join are kept; presumably givenName, unisex and
# probability - verify against the column layout of givenNames)
marker <- dplyr::full_join(unisex, uncertain, by = "givenName")[, c(1, 2, 4)]
diss_marker <- dplyr::left_join(diss_temp, marker, by = "givenName")
diss_marker$unisex <- as.numeric(diss_marker$unisex)

# manually check marked cases and code cases with no gender information
# provided by genderizeR
# unisex
write.csv2(diss_marker[!is.na(diss_marker$unisex), c(2, 10)],
           file = "marker1.csv", row.names = FALSE)
# low probability (without checked unisex cases)
write.csv2(diss_marker[!is.na(diss_marker$probability) &
                         is.na(diss_marker$unisex), c(2, 10)],
           file = "marker2.csv", row.names = FALSE)
# missing cases (without checked unisex cases); marker3.csv is written only
# once, from diss_marker, which reflects the final name database
write.csv2(diss_marker[diss_marker$genderIndicators == 0 &
                         is.na(diss_marker$unisex), c(2, 18)],
           file = "marker3.csv", row.names = FALSE)


### import manually coded gender ----

# internet research on author information, manually coded new variable
# "gender_checked"
# male = 0, female = 1, unclear = 2
# NOTE(review): the marker files are written with write.csv2 (semicolon
# separated) but read back with read.csv (comma separated); presumably the
# manually coded files were re-saved comma separated - confirm.
gender_check1 <- read.csv("marker1.csv", fileEncoding = "UTF-8",
                          colClasses = "character")
gender_check2 <- read.csv("marker2.csv", fileEncoding = "UTF-8",
                          colClasses = "character")
gender_check3 <- read.csv("marker3.csv", fileEncoding = "UTF-8",
                          colClasses = "character")
# align the second column name with the other two files so rbind() works
names(gender_check3)[2] <- "author"
gender_check <- rbind(gender_check1, gender_check2, gender_check3)
table(gender_check$gender_checked)

# check for unexpected multiple occurrences
max(table(gender_check$DFK))

# merge with diss_marker (columns 1 and 3 of gender_check: the join key DFK
# and, presumably, gender_checked)
diss_final <- dplyr::left_join(diss_marker, gender_check[, c(1, 3)],
                               by = "DFK")
# see how often genderizeR was wrong
table(diss_final$gender, diss_final$gender_checked)
table(diss_final$gender)

# the manual code overrides the genderizeR result; rows without a manual code
# (NA) keep their value because %in% never returns NA
diss_final$gender[diss_final$gender_checked %in% "2"] <- "unknown"
diss_final$gender[diss_final$gender_checked %in% "1"] <- "female"
diss_final$gender[diss_final$gender_checked %in% "0"] <- "male"
table(diss_final$gender)

diss <- diss_final # keep unknown cases in diss dataset
# drop unknown cases for final dataset; the is.na() guard prevents rows with
# a missing gender from turning into all-NA rows when diss_final is a plain
# data.frame
diss_final <- diss_final[!is.na(diss_final$gender) &
                           diss_final$gender != "unknown", ]


### descriptives ----

table(diss_final$gender) # absolute values
round(prop.table(table(diss_final$gender)), 3) * 100 # relative values

table(diss_final$PY)
table(diss_final$gender, diss_final$PY) # absolute values
# relative values: share of each gender within a year (columns sum to 100)
gender_by_year <- round(
  prop.table(table(diss_final$gender, diss_final$PY), margin = 2), 3
) * 100

# first semester students, data from destatis
destatis <- read.csv2("destatis_Studienanfaenger_140218.csv",
                      stringsAsFactors = FALSE)
destatis <- destatis[1:20, c(1, 6)]
destatis[, 2] <- destatis[, 2] * 100 # share -> percent
names(destatis) <- c("Jahr", "Beginner")

# number of dissertations, data from destatis, taken from Antoni (2019)
# one ratio per year 2000-2016, converted to percent
antoni <- data.frame(
  Jahr = as.numeric(2000:2016),
  "Promotionen Deutschland" = c(
    144/301, 197/351, 167/317, 195/339, 189/319, 292/486, 272/419, 276/449,
    330/512, 304/490, 347/509, 356/510, 390/580, 392/585, 378/539, 383/544,
    429/598
  ) * 100,
  check.names = FALSE
)

# data from this study: yearly female share, selected by name rather than by
# row position
female_by_year <- as.data.frame(gender_by_year["female", ])
female_by_year$Jahr <- as.numeric(rownames(female_by_year))
rownames(female_by_year) <- NULL
### plots ----

# the first column of female_by_year holds the yearly female share
names(female_by_year)[1] <- "Promotionen"

# join data for plot: own data first, then the two reference series, all
# matched on the year; resulting column order is
# 1 = Promotionen, 2 = Jahr, 3 = Beginner, 4 = Promotionen Deutschland
quotaplot <- dplyr::left_join(
  dplyr::left_join(female_by_year, destatis, by = "Jahr"),
  antoni,
  by = "Jahr"
)

# plot female shares by year
# x positions are row indices, so position 3 is labelled 1970 and every fifth
# position after that gets a tick
year_ticks <- seq(3, 48, 5)
plot(
  quotaplot[[1]],
  type = "l",
  lwd = 2,
  ylim = c(0, 90),
  xaxt = "n",
  xlab = "",
  ylab = "Prozent",
  main = "Frauenanteile je Jahr (1968–2017)"
)
lines(quotaplot[[4]], type = "l", lty = "dotted", lwd = 2)
lines(quotaplot[[3]], type = "l", lty = "dashed", lwd = 2)
axis(1, at = year_ticks, labels = seq(1970, 2015, 5))
#grid(NA, NULL, lty = 6, col = "lightgrey")
abline(v = year_ticks, col = "lightgrey")
abline(h = 50, col = "lightgrey")
legend(
  30, 17,
  c("Promotionen im Datensatz",
    "Promotionen Deutschland",
    "Erstsemester Deutschland"),
  lty = c("solid", "dotted", "dashed"),
  lwd = 2
)

# lm of female share (column 1) on year (column 2)
summary(lm(quotaplot[,1] ~ quotaplot[,2]))

# plot male-female publication by year
# 2017 is left out since by 12/2018 not all dissertations were indexed with
# controlled terms
year_cols <- 1:49
diss_counts <- table(diss_final$gender, diss_final$PY)
plot(
  diss_counts[1, year_cols],
  type = "l",
  xaxt = "n",
  xlab = "",
  ylab = "",
  main = "In PSYNDEX nachgewiesene Promotionen"
)
axis(1, at = year_cols, labels = 1968:2016)
lines(diss_counts[2, year_cols], type = "l", lty = "dashed")
grid(NA, NULL, lty = 6, col = "lightgrey")
legend(2, 400, c("Frauen", "Männer"), lty = c("solid", "dashed"))