Downloaded OpenWebText today for some from-scratch language model (pre)training experiments! It's a bunch of small .xz files that unzip to more small .xz files, so I ended up writing a little script to automate all the folder-creating and unzipping, with a nice in-place-updating progress meter in the terminal:
std := import('std')
str := import('str')
fs := import('fs')
fmt := import('fmt')
debug := import('debug')
// Extract every .xz archive in the current directory into its own folder,
// redrawing a single progress line in the terminal as it goes.
xzFiles := fs.listFiles('.') |> std.filter(fn(f) f.name |> str.endsWith?('.xz'))
xzFiles |> with std.each() fn(f, i) {
	name := f.name
	// the destination folder is the archive name minus its '_data.xz' suffix
	dirname := name |> str.trimEnd('_data.xz')

	// The progress text below is printed without a trailing newline, so the
	// cursor is still on the progress line. Erase that line (2K) and return
	// to column 1 (0G); moving up a line first would wipe unrelated output.
	print('\x1b[2K\x1b[0G')
	// i is zero-based; display a 1-based count so the final file reads N/N
	fmt.format('Unzipping {{0}}/{{1}} {{2}}', i + 1, len(xzFiles), f.name) |> print()

	mkdir(dirname) // assume infallible
	evt := exec('tar', ['-xf', name, '-C', dirname], '')
	if evt.status != 0 -> {
		// finish the progress line so the error starts on a line of its own
		print('\n')
		fmt.printf('Error: {{0}}', evt.stderr)
		exit(evt.status)
	}
}
// finish the last progress line so the shell prompt starts on a fresh line
print('\n')