# Load the Jane Austen novels; the code in this script reads the `text` and
# `book` columns of this data frame.
romane <- austen_books()

# Create the dfm: every element of `romane$text` becomes one document.
# The original dfm() call was cut off after `remove_numbers` and never closed.
# Stop-word removal is restored because the printed features below contain no
# English stop words. Tokenising first and passing the tokens to dfm() mirrors
# the tokens() pipeline used for the comparison word cloud in this script.
romane_dfm_sw <- romane$text %>%
  tokens(remove_punct = TRUE, remove_numbers = TRUE) %>%
  tokens_remove(stopwords("english")) %>%
  dfm(tolower = TRUE)
## Document-feature matrix of: 73,422 documents, 15,048 features (99.97% sparse) and 0 docvars.
## features
## docs sense sensibility jane austen chapter family dashwood long settled
## text1 1 1 0 0 0 0 0 0 0
## text2 0 0 0 0 0 0 0 0 0
## text3 0 0 1 1 0 0 0 0 0
## text4 0 0 0 0 0 0 0 0 0
## text5 0 0 0 0 0 0 0 0 0
## text6 0 0 0 0 0 0 0 0 0
## features
## docs sussex
## text1 0
## text2 0
## text3 0
## text4 0
## text5 0
## text6 0
## [ reached max_ndoc ... 73,416 more documents, reached max_nfeat ... 15,038 more features ]
# Trim the full-corpus dfm with a minimum term frequency of 20, without
# progress messages.
romane_dfm_trim <- dfm_trim(romane_dfm_sw, min_termfreq = 20, verbose = FALSE)

# Rows of the corpus that belong to the novel "Emma".
roman_emma <- filter(romane, book == "Emma")

# Document-feature matrix for "Emma" (one document per element of `text`).
# NOTE(review): the original dfm() call was cut off after `remove_numbers`
# and never closed. Stop-word removal is reconstructed by analogy with the
# full-corpus dfm `romane_dfm_sw` -- confirm that it was intended here.
roman_emma_dfm <- roman_emma$text %>%
  tokens(remove_punct = TRUE, remove_numbers = TRUE) %>%
  tokens_remove(stopwords("english")) %>%
  dfm(tolower = TRUE)

# Trim the "Emma" dfm with a minimum term frequency of 20.
roman_emma_dfm_trim <- roman_emma_dfm %>%
  dfm_trim(min_termfreq = 20, verbose = FALSE)

# 3.3 Erstelle eine Comparison Wordcloud mit deinem gesamten Korpus.
# One data frame per novel, selected by the value of the `book` column.
roman_emma <- filter(romane, book == "Emma")
roman_sense <- filter(romane, book == "Sense & Sensibility")
roman_pride <- filter(romane, book == "Pride & Prejudice")
roman_mansfield <- filter(romane, book == "Mansfield Park")
roman_northanger <- filter(romane, book == "Northanger Abbey")
roman_persuasion <- filter(romane, book == "Persuasion")

# Build a lower-cased dfm from the `text` column of one novel's rows, with
# punctuation, numbers and English stop words removed.
# NOTE(review): each of the six original dfm() calls was cut off after
# `remove_numbers` and never closed. Stop-word removal is reconstructed by
# analogy with the full-corpus dfm `romane_dfm_sw` -- confirm it was intended.
build_book_dfm <- function(book_rows) {
  book_rows$text %>%
    tokens(remove_punct = TRUE, remove_numbers = TRUE) %>%
    tokens_remove(stopwords("english")) %>%
    dfm(tolower = TRUE)
}

# One document-feature matrix per novel.
roman_emma_dfm <- build_book_dfm(roman_emma)
roman_sense_dfm <- build_book_dfm(roman_sense)
roman_pride_dfm <- build_book_dfm(roman_pride)
roman_mansfield_dfm <- build_book_dfm(roman_mansfield)
roman_northanger_dfm <- build_book_dfm(roman_northanger)
roman_persuasion_dfm <- build_book_dfm(roman_persuasion)
# Bind the per-novel data frames into one data frame.
# According to the original author this is equivalent to:
# romane <- austen_books()
# NOTE(review): the original rbind() line ended in a pipe whose continuation
# is missing, so it piped into the next assignment. The dangling pipe is
# dropped here so the statement parses; restore the lost step if one existed.
jane <- rbind(
  roman_emma,
  roman_sense,
  roman_pride,
  roman_mansfield,
  roman_northanger,
  roman_persuasion
)

# Turn the data frame into a corpus, taking the documents from column `text`.
jane_corp <- corpus(jane, text_field = "text")
# To draw a comparison word cloud you first need a data frame that holds all
# texts in one clearly defined text column. That data frame is turned into a
# corpus object with corpus(), which is told which field holds the text.
# corpus_subset() then selects the documents to plot: it takes the corpus and
# a condition on the metadata columns (here the book title).
# tokens() is used to strip punctuation, and tokens_remove() receives the
# stop-word set from stopwords().
# The tokens are written into a dfm, which is grouped by the metadata column
# and then trimmed by term frequency (verbose = FALSE silences messages).
# Finally the dfm is handed to textplot_wordcloud() with comparison = TRUE.
#
# The original pipeline started in the middle of a call (`book %in% ...))`);
# the corpus_subset() head described above is restored on `jane_corp`.
jane_corp %>%
  corpus_subset(
    book %in% c(
      "Emma", "Sense & Sensibility", "Pride & Prejudice",
      "Mansfield Park", "Northanger Abbey", "Persuasion"
    )
  ) %>%
  tokens(remove_punct = TRUE) %>%
  tokens_remove(stopwords("english")) %>%
  dfm() %>%
  # Author's note: only up to 7 different groups are accepted; with 6 books
  # this is not a problem. (TODO confirm the exact limit in the package docs.)
  dfm_group(groups = book) %>%
  dfm_trim(min_termfreq = 20, verbose = FALSE) %>%
  textplot_wordcloud(comparison = TRUE)

# Wordclouds können nur bis zu 7 verschiedene Gruppierungen übernehmen
# das dfm muss die Kriterien erfüllen -> bei mehr als 7 verschiedenen Texten muss gruppiert werden
# Split the `text` column into three-word n-grams (stored in column `ngram`).
janebigrams <- unnest_tokens(romane, ngram, "text", token = "ngrams", n = 3)

# Spread each n-gram over one column per word, splitting at single spaces.
janebigrams.getrennt <- separate(
  janebigrams,
  ngram,
  c("Wort1", "Wort2", "Wort3"),
  sep = " "
)

# English stop-word list used by the filter below.
Stoppwörter <- stopwords(language = "en")

# Keep the rows whose first and second word are not stop words; the third
# word is not checked.
janebigrams.woSW <- filter(
  janebigrams.getrennt,
  !Wort1 %in% Stoppwörter,
  !Wort2 %in% Stoppwörter
)
# New bigram counts: how often each (Wort1, Wort2) pair occurs.
# The original assignment target was swallowed by the comment line, which
# left count() without input data; the result is now stored under a name.
# Rows with a missing word are dropped first so that no "NA" node shows up in
# the graph (presumably produced by text lines too short for an n-gram --
# the filter is a no-op if there are none).
janebigrams.counts <- janebigrams.woSW %>%
  filter(!is.na(Wort1), !is.na(Wort2)) %>%
  count(Wort1, Wort2, sort = TRUE) %>%
  filter(n > 40) # try other thresholds here

# Network plot of the frequent word pairs. The original ggraph() call had an
# empty first argument; the count table is passed as the graph, in the same
# way the other ggraph() calls in this script are given a data frame.
ggraph(janebigrams.counts, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1)

# Word triples whose third word is "emma", counted and sorted by frequency.
emma.bigrams <- janebigrams.woSW %>%
  filter(Wort3 == "emma") %>%
  count(Wort1, Wort2, Wort3, sort = TRUE) %>%
  filter(n > 0)

# Same selection for triples whose third word is "elizabeth".
elizabeth.bigrams <- janebigrams.woSW %>%
  filter(Wort3 == "elizabeth") %>%
  count(Wort1, Wort2, Wort3, sort = TRUE) %>%
  filter(n > 0)

# Network plot for the "emma" counts, drawn with the "fr" layout.
ggraph(graph = emma.bigrams, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(mapping = aes(label = name), vjust = 1, hjust = 1)

# Network plot for the "elizabeth" counts, drawn with the "fr" layout.
ggraph(graph = elizabeth.bigrams, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(mapping = aes(label = name), vjust = 1, hjust = 1)

# 5.1 Erstelle zwei Topic Models mit einer unterschiedlichen Anzahl von Topics mit deinem gesamten Korpus.
# Custom words to drop before topic modelling, stored as a one-column data
# frame so the table can be joined on `word`. The original built a bare
# character vector (followed by a dangling pipe) and then tried to set
# colnames() on it, which a plain vector does not support.
words_to_kick <- data.frame(
  word = c("mrs", "mr", "miss", "lady", "something"),
  stringsAsFactors = FALSE
)
# Maybe anti_join() needs a data frame here -- try again at home.
# Tokenise all novels into one word per row and drop the custom words.
# anti_join() expects `words_to_kick` to be a data frame with a `word` column.
janetokens <- austen_books() %>%
  unnest_tokens(output = word, input = text) %>%
  anti_join(words_to_kick, by = "word")

# The original dfm() call was cut off after `remove_punct` and never closed.
# Stop-word removal is restored because the object name says "ohneSW"
# (without stop words).
# NOTE(review): every element of `janetokens$word` (a single word) becomes
# its own document here -- confirm that this is the document unit wanted for
# the topic model that follows.
janedfm.ohneSW <- janetokens$word %>%
  tokens(remove_punct = TRUE) %>%
  tokens_remove(stopwords("english")) %>%
  dfm(tolower = TRUE)
#stm (structural topic modeling) Befehl geben
## somehow my R session always runs into an error on this line
## aber auch nur auf meinem Mac ?????
#JANE_TM <- stm(janedfm.ohneSW, K = 10,
# verbose = FALSE)
#in tidy format bringen
#JANE_TM_tidy <- tidy(JANE_TM)
#JANE_TM_tidy %>%
# group_by(topic) %>%
# top_n(10, beta) %>%
# ungroup() %>%
# mutate(topic = paste0("Topic ", topic),
# term = reorder_within(term, beta, topic)) %>%
# ggplot(aes(term, beta, fill = as.factor(topic))) +
# geom_col(alpha = 0.8, show.legend = FALSE) +
# facet_wrap(~ topic, scales = "free_y") +
# coord_flip() +
# scale_x_reordered() +
# labs(x = NULL, y = expression(beta),
# title = "Höchsten Wortwahrscheinlichkeiten nach Topic")
# Sentiment using hp example and austen books
# example
#tidyHP <- HP %>%
# groups the data based on the column "Band" in the HP dataframe
# group_by(Band) %>%
# creates a new column called linenumber based on the row_number in the
# HP dataset (which is already grouped)
# mutate(
# linenumber = row_number()) %>%
# then she ungroups the whole thing again
# ungroup() %>%
# unnest_tokens(word, Text)
#HP_sentiment <- tidyHP %>%
# inner_join(get_sentiments("bing")) %>%
# count(Band, index = linenumber, sentiment) %>%
# pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
# mutate(sentiment = positive - negative)
# end example
# One row per word, keeping the book and the line the word came from.
tidyRomane <- romane %>%
  group_by(book) %>%
  # Number the rows within each book. The `mutate(` opener was missing in the
  # original, which left an unmatched closing parenthesis on this step.
  mutate(linenumber = row_number()) %>%
  ungroup() %>%
  unnest_tokens(word, text)
# Net sentiment per text line, based on the "nrc" lexicon.
Romane_sentiment_nrc <- tidyRomane %>%
  # inner_join() keeps only the words that also occur in the lexicon returned
  # by get_sentiments("nrc") and attaches that lexicon's `sentiment` label.
  # The join key is stated explicitly: `word` is the column both tables are
  # matched on.
  inner_join(get_sentiments("nrc"), by = "word") %>%
  # Count the matches per book, line (stored as `index`) and sentiment label.
  count(book, index = linenumber, sentiment) %>%
  # Widen the table: one column per sentiment label (names_from), filled with
  # the counts from column `n`; combinations that never occur are set to 0.
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  # Add a `sentiment` column: the positive count minus the negative count of
  # each line.
  mutate(sentiment = positive - negative)
# Net sentiment per text line, based on the "bing" lexicon.
Romane_sentiment_bing <- tidyRomane %>%
  # Keep only words found in the lexicon; the join key `word` is stated
  # explicitly instead of being guessed from the shared column names.
  inner_join(get_sentiments("bing"), by = "word") %>%
  # Count the matches per book, line (stored as `index`) and sentiment label.
  count(book, index = linenumber, sentiment) %>%
  # One column per sentiment label; missing combinations are set to 0.
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  # Positive count minus negative count of each line.
  mutate(sentiment = positive - negative)
# Bar chart of the nrc-based net sentiment, coloured by book.
Romane_sentiment_nrc %>%
  ggplot(mapping = aes(x = index, y = sentiment, fill = book)) +
  # Columns as the geometry; the fill legend is hidden.
  geom_col(show.legend = FALSE) +
  # Split the single chart into one panel per book, laid out in three
  # columns, with the x axis scaled independently for every panel.
  facet_wrap(~book, ncol = 3, scales = "free_x")

# Same chart for the bing-based net sentiment.
Romane_sentiment_bing %>%
  ggplot(mapping = aes(x = index, y = sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 3, scales = "free_x")

# scale_fill_brewer to use palettes of colors