А
Size: a a a
А
АК
AS
А
АК
AS
АК
AS
ЕТ
MV
АК
library(jsonlite)
library(data.table)
# Source dump and the two CSV outputs derived from it.
in_file <- "Downloads/goodreads_books.json.gz"
book_out_file <- "Downloads/goodreads_books.csv.gz"
author_out_file <- "Downloads/goodreads_authors.csv.gz"

# Both outputs are written with append = TRUE further down, so remove any
# files left over from a previous run before starting.
unlink(c(book_out_file, author_out_file))
# Page handler for stream_in(): extracts a fixed set of book-level fields
# from `entry` and appends them to `book_out_file` as comma-separated rows.
#
# entry: one parsed page passed in by stream_in(); fields are looked up by
#        name with `[[`.
# Returns whatever fwrite() returns; called for its side effect.
book_handler <- function(entry) {
  book_fields <- c(
    "book_id", "work_id", "title",
    "publication_year", "num_pages", "publisher"
  )

  # Named list with one element per field, in output-column order.
  columns <- lapply(
    setNames(nm = book_fields),
    function(field) entry[[field]]
  )

  fwrite(columns, file = book_out_file, append = TRUE, sep = ",", na = "")
}
# First pass over the dump: feed it to book_handler in pages of 1000.
stream_in(con = gzfile(in_file), handler = book_handler, pagesize = 1000)
# Page handler for stream_in(): flattens the `authors` field of `entry` into
# one table keyed by book and appends it to `author_out_file`.
#
# entry: one parsed page passed in by stream_in(); `authors` is presumably a
#        list with one table of authors per book (TODO confirm against data).
# Returns whatever fwrite() returns; called for its side effect.
author_handler <- function(entry) {
  # Name each element by its book id so rbindlist() can emit it as a column.
  authors_by_book <- setNames(entry[["authors"]], entry[["book_id"]])
  author_rows <- rbindlist(authors_by_book, idcol = "book_id")

  fwrite(author_rows, file = author_out_file, append = TRUE, sep = ",", na = "")
}
# Second pass over the same dump: feed it to author_handler in pages of 1000.
stream_in(con = gzfile(in_file), handler = author_handler, pagesize = 1000)
АК
e <- new.env()
внутри хэндлера:
e[[entry[["book_id"]]]] <- entry
После окончания процесса:
rbindlist(e, fill = TRUE)
AS
АК
zcat "Downloads/goodreads_books.json.gz" | head | jq -r '[.book_id, .work_id, .title] | @csv'
"5333265","5400751","W.C. Fields: A Life on Film"
"1333909","1323437","Good Harbor"
"7327624","8948723","The Unschooled Wizard (Sun Wolf and Starhawk, #1-2)"
"6066819","6243154","Best Friends Forever"
"287140","278577","Runic Astrology: Starcraft and Timekeeping in the Northern Tradition"
"287141","278578","The Aeneid for Boys and Girls"
"378460","368291","The Wanting of Levine"
"6066812","701117","All's Fairy in Love and War (Avalon: Web of Magic, #8)"
"34883016","56135087","Playmaker: A Venom Series Novella"
"287149","278586","The Devil's Notebook"
АК
zcat "Downloads/goodreads_books.json.gz" | head | jq -r '[.book_id, .work_id, .title] | @csv'
"5333265","5400751","W.C. Fields: A Life on Film"
"1333909","1323437","Good Harbor"
"7327624","8948723","The Unschooled Wizard (Sun Wolf and Starhawk, #1-2)"
"6066819","6243154","Best Friends Forever"
"287140","278577","Runic Astrology: Starcraft and Timekeeping in the Northern Tradition"
"287141","278578","The Aeneid for Boys and Girls"
"378460","368291","The Wanting of Levine"
"6066812","701117","All's Fairy in Love and War (Avalon: Web of Magic, #8)"
"34883016","56135087","Playmaker: A Venom Series Novella"
"287149","278586","The Devil's Notebook"
zcat "Downloads/goodreads_books.json.gz" | head | jq -r '[(.book_id), (.authors | map(.author_id) | join("|"))] | @csv'
"5333265","604031"
"1333909","626222"
"7327624","10333"
"6066819","9212"
"287140","149918"
"287141","3041852"
"378460","215594"
"6066812","19158"
"34883016","5807700"
"287149","2983296|40075"
АК
Для jq есть пакет jqr, где можно использовать что-то вроде:
library(jsonlite)
library(jqr)
# Run the jq filter over the gzipped dump so only three keys per record are
# kept, then combine the filtered output and parse it with fromJSON().
filtered <- jq(gzfile(in_file), "{book_id, work_id, title}")
x <- fromJSON(combine(filtered))
str(x)
# 'data.frame': 2360655 obs. of 3 variables:
# $ book_id: chr "5333265" "1333909" "7327624" "6066819" ...
# $ work_id: chr "5400751" "1323437" "8948723" "6243154" ...
# $ title : chr "W.C. Fields: A Life on Film" "Good Harbor" "The Unschooled Wizard (Sun Wolf and Starhawk, #1-2)" "Best Friends Forever" ...
jq рекомендуется.
AS
АК
AS
AS