the stream

2022/9/2 5:53

Downloaded OpenWebText today for some from-scratch language model (pre)training experiments! It's a bunch of small .xz files that unzip to more small .xz files, so I ended up writing a little script to automate all the folder-creating and unzipping, with a nice in-place-updating progress meter in the terminal:

std := import('std')
str := import('str')
fs := import('fs')
fmt := import('fmt')
debug := import('debug')

// Entries of the current directory whose names end in '.xz'.
xzFiles := std.filter(fs.listFiles('.'), fn(entry) str.endsWith?(entry.name, '.xz'))

// Extract every archive in xzFiles into a directory of its own, redrawing a
// one-line progress meter before each extraction. On the first tar failure,
// print tar's stderr and exit with tar's status code.
xzFiles |> with std.each() fn(f, i) {
	name := f.name
	// Target directory: the archive name with its '_data.xz' suffix trimmed.
	dirname := name |> str.trimEnd('_data.xz')
	// NOTE(review): the progress text below is printed without a trailing
	// newline, yet this sequence first moves the cursor up one line; confirm
	// the meter redraws in place on the terminals you care about.
	print('\x1b[0F\x1b[2K\x1b[0G') // erase previous line
	// std.each passes a zero-based index; show a 1-based count so the meter
	// runs from 1/N to N/N instead of 0/N to (N-1)/N.
	fmt.format('Unzipping {{0}}/{{1}} {{2}}', i + 1, len(xzFiles), name) |> print()

	mkdir(dirname) // assume infallible
	// Run tar to unpack the archive into dirname; '' is the (empty) stdin.
	evt := exec('tar', ['-xf', name, '-C', dirname], '')
	if evt.status != 0 -> {
		fmt.printf('Error: {{0}}', evt.stderr)
		exit(evt.status)
	}
}