library(janeaustenr)
library(dplyr)
library(quanteda)

romane <- austen_books()

# create the dfm (tokenize first; in current quanteda versions the
# cleaning arguments belong to tokens(), not dfm(), and the stopword
# set must be removed explicitly rather than passed positionally)
romane_dfm_sw <- romane$text %>%
    tokens(remove_punct = TRUE, remove_numbers = TRUE) %>%
    tokens_tolower() %>%
    tokens_remove(stopwords("en")) %>%
    dfm()
show(romane_dfm_sw)
## Document-feature matrix of: 73,422 documents, 15,048 features (99.97% sparse) and 0 docvars.
##        features
## docs    sense sensibility jane austen chapter family dashwood long settled
##   text1     1           1    0      0       0      0        0    0       0
##   text2     0           0    0      0       0      0        0    0       0
##   text3     0           0    1      1       0      0        0    0       0
##   text4     0           0    0      0       0      0        0    0       0
##   text5     0           0    0      0       0      0        0    0       0
##   text6     0           0    0      0       0      0        0    0       0
##        features
## docs    sussex
##   text1      0
##   text2      0
##   text3      0
##   text4      0
##   text5      0
##   text6      0
## [ reached max_ndoc ... 73,416 more documents, reached max_nfeat ... 15,038 more features ]
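A quick look at the most frequent features confirms the stopwords are gone:

topfeatures(romane_dfm_sw, 10)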
library(quanteda.textplots)

romane_dfm_trim <- romane_dfm_sw %>%
    dfm_trim(min_termfreq = 20, verbose = FALSE)

textplot_wordcloud(romane_dfm_trim)
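If the default cloud is too crowded, textplot_wordcloud() can be tuned; min_count and max_words are the usual knobs (the values here are just a guess):

textplot_wordcloud(romane_dfm_trim, min_count = 50, max_words = 100)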

roman_emma <- filter(romane, book == "Emma")

roman_emma_dfm <- roman_emma$text %>%
    tokens(remove_punct = TRUE, remove_numbers = TRUE) %>%
    tokens_tolower() %>%
    tokens_remove(stopwords("en")) %>%
    dfm()

roman_emma_dfm_trim <- roman_emma_dfm %>%
      dfm_trim(min_termfreq = 20, verbose = FALSE)

textplot_wordcloud(roman_emma_dfm_trim)

3.3 Create a comparison wordcloud with your whole corpus.

roman_emma <- filter(romane, book == "Emma")
roman_sense <- filter(romane, book == "Sense & Sensibility")
roman_pride <- filter(romane, book == "Pride & Prejudice")
roman_mansfield <- filter(romane, book == "Mansfield Park")
roman_northanger <- filter(romane, book == "Northanger Abbey")
roman_persuasion <- filter(romane, book == "Persuasion")

# per-book dfms, built the same way (not strictly needed for the
# comparison cloud below, which starts from a corpus instead)
roman_emma_dfm <- roman_emma$text %>%
    tokens(remove_punct = TRUE, remove_numbers = TRUE) %>%
    tokens_tolower() %>%
    tokens_remove(stopwords("en")) %>%
    dfm()

roman_sense_dfm <- roman_sense$text %>%
    tokens(remove_punct = TRUE, remove_numbers = TRUE) %>%
    tokens_tolower() %>%
    tokens_remove(stopwords("en")) %>%
    dfm()

roman_pride_dfm <- roman_pride$text %>%
    tokens(remove_punct = TRUE, remove_numbers = TRUE) %>%
    tokens_tolower() %>%
    tokens_remove(stopwords("en")) %>%
    dfm()

roman_mansfield_dfm <- roman_mansfield$text %>%
    tokens(remove_punct = TRUE, remove_numbers = TRUE) %>%
    tokens_tolower() %>%
    tokens_remove(stopwords("en")) %>%
    dfm()

roman_northanger_dfm <- roman_northanger$text %>%
    tokens(remove_punct = TRUE, remove_numbers = TRUE) %>%
    tokens_tolower() %>%
    tokens_remove(stopwords("en")) %>%
    dfm()

roman_persuasion_dfm <- roman_persuasion$text %>%
    tokens(remove_punct = TRUE, remove_numbers = TRUE) %>%
    tokens_tolower() %>%
    tokens_remove(stopwords("en")) %>%
    dfm()

# bind the novels into one data frame
# (this reproduces what austen_books() already returns:
# romane <- austen_books())
jane <- rbind(roman_emma, roman_sense, roman_pride, roman_mansfield, roman_northanger, roman_persuasion) %>% as.data.frame()

View(jane)

jane_corp <- corpus(jane, 
                 text_field = "text")

# To build a comparison wordcloud you first need a data frame with all
# texts and a clearly defined text column.
# That data frame is bound to a corpus object with corpus(), telling the
# function which column holds the text (text_field).
# Then the wordcloud is built: corpus_subset() picks the documents by a
# metadata variable (e.g. the book title).
# tokens() strips the punctuation, and tokens_remove() takes the
# stopwords() function with the stopword set.
# The result is written into a dfm and grouped by the variable chosen
# above; the dfm is then trimmed by term frequency (verbose = FALSE just
# suppresses the progress messages).
# Finally everything is passed to textplot_wordcloud() with the argument
# comparison = TRUE.
library(quanteda.textplots)
corpus_subset(jane_corp, 
              book %in% c("Emma", "Sense & Sensibility", "Pride & Prejudice", "Mansfield Park", "Northanger Abbey", "Persuasion")) %>%
    tokens(remove_punct = TRUE) %>%
    tokens_remove(stopwords("english")) %>%
    dfm() %>%
  # a comparison cloud supports at most 8 groups; with 6 books we are fine
    dfm_group(groups = book) %>%
    dfm_trim(min_termfreq = 20, verbose = FALSE) %>%
    textplot_wordcloud(comparison = TRUE)

# textplot_wordcloud(comparison = TRUE) can only handle up to 8 groups,
# so the dfm has to meet that criterion -> with more than 8 distinct
# documents you have to group first
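To verify the grouped dfm stays within that limit, one can count its documents (a minimal sanity check reusing the pipeline from above):

jane_corp %>%
    tokens(remove_punct = TRUE) %>%
    tokens_remove(stopwords("english")) %>%
    dfm() %>%
    dfm_group(groups = book) %>%
    ndoc()  # 6 documents, i.e. at most 8, so comparison = TRUE works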
library(tidytext)
library(tidyverse)

# note: n = 3 produces trigrams, even though the objects below are named "bigrams"
janebigrams <- romane %>% unnest_tokens(ngram, "text", token = "ngrams", n = 3)

janebigrams.getrennt <- janebigrams %>% separate(ngram, c("Wort1", "Wort2", "Wort3"), sep = " ")

Stoppwörter <- stopwords(language = "en") 

# Wort3 is left unfiltered on purpose, so trigrams ending in a name
# (e.g. "emma") survive for the filters further down
janebigrams.woSW <- janebigrams.getrennt %>%
  filter(!Wort1 %in% Stoppwörter) %>%
  filter(!Wort2 %in% Stoppwörter)

# counts of the first two words of each trigram:
janebigrams.top <- janebigrams.woSW %>% 
  count(Wort1, Wort2, sort = TRUE) %>% 
  filter(n > 40) # try other thresholds here
library(igraph)
library(ggraph)

# ggraph() needs a graph object, not a data frame, so convert first
janebigrams.graph <- graph_from_data_frame(janebigrams.top)

ggraph(janebigrams.graph, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1)

library(dplyr)
emma.bigrams <- janebigrams.woSW  %>%
  filter(Wort3 == "emma") %>% 
  count(Wort1, Wort2, Wort3, sort = TRUE) %>% 
  filter(n > 0) # count() always yields n >= 1, so this keeps everything; raise the threshold to thin the graph

elizabeth.bigrams <- janebigrams.woSW  %>%
  filter(Wort3 == "elizabeth") %>% 
  count(Wort1, Wort2, Wort3, sort = TRUE) %>% 
  filter(n > 0)

library(ggraph)
ggraph(graph_from_data_frame(emma.bigrams), layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1)

ggraph(graph_from_data_frame(elizabeth.bigrams), layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1)

5.1 Create two topic models with different numbers of topics using your whole corpus.

library(quanteda)
library(tidytext)
library(dplyr)
library(stm)
library(reshape2)

words_to_kick <- data.frame(word = c("mrs", "mr", "miss", "lady", "something"))

# anti_join() needs a data frame on its right-hand side, which is why
# words_to_kick is built as a data frame with a "word" column

janetokens <- austen_books() %>%
  unnest_tokens(output = word, input = text) %>%
  anti_join(words_to_kick, by = "word") 
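A quick sanity check that the anti_join removed those words (this should return 0):

janetokens %>% filter(word %in% words_to_kick$word) %>% nrow()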



# build the dfm with one document per book -- dfm(janetokens$word) would
# treat every single word as its own document (hundreds of thousands of
# one-word "documents"), which is what made stm() crash below;
# unnest_tokens() has already lowercased and stripped punctuation
janedfm.ohneSW <- janetokens %>%
  filter(!word %in% stopwords("en")) %>%
  count(book, word) %>%
  cast_dfm(book, word, n)


# fit the stm (structural topic model)

## this line used to crash the R session -- the cause was not the Mac but
## the one-word-per-document dfm; with the per-book dfm above it should run

JANE_TM <- stm(janedfm.ohneSW, K = 10,
               verbose = FALSE)

# bring into tidy format
JANE_TM_tidy <- tidy(JANE_TM)

# plot
library(ggplot2)

JANE_TM_tidy %>%
    group_by(topic) %>%
    top_n(10, beta) %>%
    ungroup() %>%
    mutate(topic = paste0("Topic ", topic),
           term = reorder_within(term, beta, topic)) %>%
    ggplot(aes(term, beta, fill = as.factor(topic))) +
    geom_col(alpha = 0.8, show.legend = FALSE) +
    facet_wrap(~ topic, scales = "free_y") +
    coord_flip() +
    scale_x_reordered() +
    labs(x = NULL, y = expression(beta),
         title = "Highest word probabilities per topic")

Sentiment analysis using the HP example and the Austen books

library(tidytext)
library(dplyr)
library(stringr)
library(tidyr)
library(textdata)
library(ggplot2)

# example

#tidyHP <- HP %>% 
  # groups the data based on the column "Band" in the HP dataframe
#  group_by(Band) %>%
  # creates a new column called linenumber from the row_number in the
  # HP dataset (which is already grouped, so numbering restarts per Band)
#  mutate(
#    linenumber = row_number()) %>%
  # then the whole thing is ungrouped again
#  ungroup() %>%
#  unnest_tokens(word, Text)

#HP_sentiment <- tidyHP %>%
#  inner_join(get_sentiments("bing")) %>%
#  count(Band, index = linenumber, sentiment) %>%
#  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>% 
#  mutate(sentiment = positive - negative)

# end example

tidyRomane <- romane %>% 
  group_by(book) %>%
  mutate(
    linenumber = row_number()) %>%
  ungroup() %>%
  unnest_tokens(word, text)

Romane_sentiment_nrc <- tidyRomane %>%
  # inner_join() keeps only the rows (words) that also occur in the
  # lexicon and attaches each word's sentiment category
  
  # nrc is a dataframe with a word column and a sentiment column; unlike
  # bing (positive/negative only, n = 6,786 words), nrc assigns words to
  # categories such as anger, fear, joy, trust, positive and negative
  
  # we join by the column "word" because that is the column present in
  # both dataframes
  inner_join(get_sentiments("nrc")) %>%
  # counts the sentiment categories per line and book
  count(book, index = linenumber, sentiment)  %>%
  # widens the dataframe using the different strings in sentiment
  # (names_from) and fills in the corresponding values from the column n;
  # empty cells are set to 0
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>% 
  # adds a sentiment column: positive minus negative count per line
  # (the other nrc categories are kept but not used further here)
  mutate(sentiment = positive - negative)


Romane_sentiment_bing <- tidyRomane %>%
  inner_join(get_sentiments("bing")) %>%
  count(book, index = linenumber, sentiment)  %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>% 
  mutate(sentiment = positive - negative)
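To see which words drive these scores, the same bing join can be counted per word instead of per line (standard tidytext pattern):

tidyRomane %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment, sort = TRUE) %>%
  head(10)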

ggplot(Romane_sentiment_nrc, aes(index, sentiment, fill = book)) +
  # type of graph to be used 
  geom_col(show.legend = FALSE) +
  # without faceting all the data would end up in one panel;
  # facet_wrap() splits the plot into one panel per book,
  # ncol sets the number of panel columns,
  # and scales = "free_x" lets each panel scale its x axis independently
  facet_wrap(~book, ncol = 3, scales = "free_x")

ggplot(Romane_sentiment_bing, aes(index, sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 3, scales = "free_x")
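The per-line index makes for very spiky plots; the tidytext book instead buckets the text into chunks of 80 lines, which works here too (a sketch with the same pipeline):

Romane_sentiment_bing80 <- tidyRomane %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)

ggplot(Romane_sentiment_bing80, aes(index, sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 3, scales = "free_x")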

# scale_fill_brewer to use palettes of colors
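For example (the qualitative "Dark2" palette has 8 colors, enough for six books):

ggplot(Romane_sentiment_bing, aes(index, sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 3, scales = "free_x") +
  scale_fill_brewer(palette = "Dark2")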