The disassembler code is where we define which instruction tokens to return for each instruction. The following code implements `CoolVM`'s instructions: ```c int64_t execute_instr(struct program_struct* prog, struct instr* instr) { int64_t var_18 = 0 int64_t var_10 = 0 uint32_t opcode = zx.d(instr->opcode) int64_t status if (opcode u> 0xa) { status = 1 } else { switch (opcode) { case 0 // mov *instr->operand1 = zx.d(instr->operand3) case 1 // push *prog->sp = *instr->operand2 prog->sp = prog->sp + 4 case 2 // pop prog->sp = prog->sp - 4 *instr->operand1 = *prog->sp case 3 // sub prog->reg4 = *instr->operand2 - *instr->operand1 case 4 // jmp not zero if (prog->reg4 != 0) { *prog = &prog->pc[zx.q(instr->operand3)] } case 5 // jmp not zero back if (prog->reg4 != 0) { *prog = &prog->pc[neg.q(zx.q(instr->operand3))] } case 6 // print putchar(c: zx.d(*instr->operand2)) case 7 // read read(fd: 0, buf: instr->operand1, nbytes: 1) case 8 // exit prog->exit = 1 case 9 // xor int64_t rax_59 = instr->operand1 *rax_59 = *rax_59 ^ zx.d(instr->operand3) case 0xa // exit if not zero prog->reg4 = *instr->operand2 - *instr->operand1 if (prog->reg4 != 0) { prog->exit = 1 } } status = 0 } return status } ``` # Instructions `CoolVM` has - mov - push - pop - sub - jnz (jump not zero) - jnzb (jump not zero backwards) - print - read - exit - xor - enz (exit if not zero) Create a class to decode the instruction. In the case of `CoolVM`, the decoding is fairly simple. 
```python class Instruction():     def __init__(self,data):         self.opcode = data[0]         self.op1 = self.op2 = self.op3 = None         for i,op in enumerate(["self.op1","self.op2"]):             if data[i+1] < 5:                 exec(op + '= f"r{data[i+1]}"')             elif data[i+1] == 5:                       exec(op + '= "pc"')             elif data[i+1] == 6:                       exec(op + '= "sp"')             else:                 exec(op + '= "BAD"')         self.op3 = data[3] ``` In our disassembler class, we can create a dictionary where the key is the opcode and the value is a two-element list containing the mnemonic and the function that will return the instruction tokens. ```python     def __init__(self):         self.instructions = {             0: ["mov",self.reg_imm],             1: ["push",self.reg2],             2: ["pop",self.reg1],             3: ["sub",self.two_reg],             4: ["jnz",self.jnz],             5: ["jnzb",self.jnzb],             6: ["print",self.reg2],             7: ["read",self.reg1],             8: ["exit",self.exit],             9: ["xor",self.reg_imm],             10:["ene",self.ene],         } ``` With this dictionary, the disas function will grab the opcode and call that instruction's function. ```python def disas(self,data,addr):         instr = Instruction(data)         mnem, func = self.instructions[instr.opcode]         return func(mnem,instr,addr) ``` We can also make a few simple templates for common patterns the instructions will have. The common patterns in `CoolVM` are - MNEM REG, IMM (one register, one immediate) - MNEM REG (one register, either operand 1 or operand 2) - MNEM REG, REG (two registers) Then there are a few that we will want to implement individually, such as the jumps. For the instruction token types, there are a ton to choose from depending on the data type of the token. Binja will render the tokens differently depending on the type. 
```python class InstructionTextTokenType(enum.IntEnum): TextToken = 0 InstructionToken = 1 OperandSeparatorToken = 2 RegisterToken = 3 IntegerToken = 4 PossibleAddressToken = 5 BeginMemoryOperandToken = 6 EndMemoryOperandToken = 7 FloatingPointToken = 8 AnnotationToken = 9 CodeRelativeAddressToken = 10 ArgumentNameToken = 11 HexDumpByteValueToken = 12 HexDumpSkippedByteToken = 13 HexDumpInvalidByteToken = 14 HexDumpTextToken = 15 OpcodeToken = 16 StringToken = 17 CharacterConstantToken = 18 KeywordToken = 19 TypeNameToken = 20 FieldNameToken = 21 NameSpaceToken = 22 NameSpaceSeparatorToken = 23 TagToken = 24 StructOffsetToken = 25 StructOffsetByteValueToken = 26 StructureHexDumpTextToken = 27 GotoLabelToken = 28 CommentToken = 29 PossibleValueToken = 30 PossibleValueTypeToken = 31 ArrayIndexToken = 32 IndentationToken = 33 UnknownMemoryToken = 34 CodeSymbolToken = 64 DataSymbolToken = 65 LocalVariableToken = 66 ImportToken = 67 AddressDisplayToken = 68 IndirectImportToken = 69 ExternalSymbolToken = 70 ``` # One Reg, One Immediate - mov - xor This class of instructions uses `op1` as a register and `op3` as an immediate value. These instructions do not branch, so the following definition returns an empty branch list. 
(Note: Obsidian spacing makes it look bad.) ```python def reg_imm(self,mnem,instr,addr): tokens = [InstructionTextToken(InstructionTextTokenType.InstructionToken,mnem+" ")] tokens.append(InstructionTextToken(InstructionTextTokenType.RegisterToken,instr.op1)) tokens.append(InstructionTextToken(InstructionTextTokenType.OperandSeperatorToken,", ")) tokens.append(InstructionTextToken(InstructionTextTokenType.IntegerToken,hex(instr.op3),instr.op3)) return tokens,[] ``` # One Reg Operand 1 - pop - read Simple: the one register is op1. ```python def reg1(self,mnem,instr,addr): tokens = [InstructionTextToken(InstructionTextTokenType.InstructionToken,mnem+" ")] tokens.append(InstructionTextToken(InstructionTextTokenType.RegisterToken,instr.op1)) return tokens,[] ``` # One Reg Operand 2 - push - print Simple: the one register is op2. ```python def reg2(self,mnem,instr,addr): tokens = [InstructionTextToken(InstructionTextTokenType.InstructionToken,mnem+" ")] tokens.append(InstructionTextToken(InstructionTextTokenType.RegisterToken,instr.op2)) return tokens,[] ``` # Two Reg - sub Just two register tokens, for op1 and op2. This is only used for one instruction, so it could also be named `sub`. ```python def two_reg(self,mnem,instr,addr): tokens = [InstructionTextToken(InstructionTextTokenType.InstructionToken,mnem+" ")] tokens.append(InstructionTextToken(InstructionTextTokenType.RegisterToken,instr.op1)) tokens.append(InstructionTextToken(InstructionTextTokenType.OperandSeperatorToken,", ")) tokens.append(InstructionTextToken(InstructionTextTokenType.RegisterToken,instr.op2)) return tokens,[] ``` # JNZ Jump if not zero is similar to one reg, one imm, except the immediate is an address relative to the current address. Branch information is also returned. 
For reference on the `BranchInfo` class, read: [[Architecture Plugin#get_instruction_info]] ```python def jnz(self,mnem,instr,addr): target = instr.op3 + addr + 4 tokens = [InstructionTextToken(InstructionTextTokenType.InstructionToken,mnem+" ")] tokens.append(InstructionTextToken(InstructionTextTokenType.RegisterToken,instr.op1)) tokens.append(InstructionTextToken(InstructionTextTokenType.OperandSeperatorToken,", ")) tokens.append(InstructionTextToken(InstructionTextTokenType.AddressDisplayToken,hex(target),target)) true_branch = BranchInfo(BranchType.TrueBranch,target) false_branch = BranchInfo(BranchType.FalseBranch,addr +4) return tokens,[true_branch,false_branch] ``` # JNZB Jump if not zero backwards is the same as JNZ, except that we subtract the offset instead of adding it when computing the target. ```python def jnzb(self,mnem,instr,addr): target = addr - instr.op3 + 4 tokens = [InstructionTextToken(InstructionTextTokenType.InstructionToken,mnem+" ")] tokens.append(InstructionTextToken(InstructionTextTokenType.RegisterToken,instr.op1)) tokens.append(InstructionTextToken(InstructionTextTokenType.OperandSeperatorToken,", ")) tokens.append(InstructionTextToken(InstructionTextTokenType.AddressDisplayToken,hex(target),target)) true_branch = BranchInfo(BranchType.TrueBranch,target) false_branch = BranchInfo(BranchType.FalseBranch,addr+4) return tokens,[true_branch,false_branch] ``` # Exit Exit is simple because it takes no operands; however, we do have to add a branch of type `FunctionReturn` to tell binja to stop disassembling when reaching this instruction. ```python def exit(self,mnem,instr,addr): tokens = [InstructionTextToken(InstructionTextTokenType.InstructionToken,mnem)] exit_branch = BranchInfo(BranchType.FunctionReturn) return tokens,[exit_branch] ``` # Enz Exit if not zero is simple, as it follows the same formula as `two_reg` plus branch info. 
I chose to use TrueBranch because the program either exits or goes to the next instruction; for the exit case we can just leave the branch off, since it does not need to branch anywhere. The exit will be implemented in the lifter. ```python def enz(self,mnem,instr,addr): tokens,_ = self.two_reg(mnem,instr,addr) exit_branch = BranchInfo(BranchType.TrueBranch,addr+4) return tokens,[exit_branch] ``` # Results A snippet of the graph view of the disassembly: ![[basic_blocks.png]] # Code Can be found at [coolvm_binja/disassembler.py at master · thisusernameistaken/coolvm_binja (github.com)](https://github.com/thisusernameistaken/coolvm_binja/blob/master/disassembler.py) ```python from binaryninja import ( InstructionTextToken, InstructionTextTokenType, BranchType ) class BranchInfo: def __init__(self,_type,target=None): self.type = _type self.target = target class Instruction(): def __init__(self,data): self.opcode = data[0] self.op1 = self.op2 = self.op3 = None for i,op in enumerate(["self.op1","self.op2"]): if data[i+1] < 5: exec(op + '= f"r{data[i+1]}"') elif data[i+1] == 5: exec(op + '= "pc"') elif data[i+1] == 6: exec(op + '= "sp"') else: exec(op + '= "BAD"') self.op3 = data[3] class CoolVMDisassembler(): def __init__(self): self.instructions = { 0: ["mov",self.reg_imm], 1: ["push",self.reg2], 2: ["pop",self.reg1], 3: ["sub",self.two_reg], 4: ["jnz",self.jnz], 5: ["jnzb",self.jnzb], 6: ["print",self.reg2], 7: ["read",self.reg1], 8: ["exit",self.exit], 9: ["xor",self.reg_imm], 10:["ene",self.ene], } def disas(self,data,addr): instr = Instruction(data) mnem, func = self.instructions[instr.opcode] return func(mnem,instr,addr) def reg_imm(self,mnem,instr,addr): tokens = [InstructionTextToken(InstructionTextTokenType.InstructionToken,mnem+" ")] tokens.append(InstructionTextToken(InstructionTextTokenType.RegisterToken,instr.op1)) tokens.append(InstructionTextToken(InstructionTextTokenType.OperandSeparatorToken,", ")) 
tokens.append(InstructionTextToken(InstructionTextTokenType.IntegerToken,hex(instr.op3),instr.op3)) return tokens,[] def reg1(self,mnem,instr,addr): tokens = [InstructionTextToken(InstructionTextTokenType.InstructionToken,mnem+" ")] tokens.append(InstructionTextToken(InstructionTextTokenType.RegisterToken,instr.op1)) return tokens,[] def reg2(self,mnem,instr,addr): tokens = [InstructionTextToken(InstructionTextTokenType.InstructionToken,mnem+" ")] tokens.append(InstructionTextToken(InstructionTextTokenType.RegisterToken,instr.op2)) return tokens,[] def two_reg(self,mnem,instr,addr): tokens = [InstructionTextToken(InstructionTextTokenType.InstructionToken,mnem+" ")] tokens.append(InstructionTextToken(InstructionTextTokenType.RegisterToken,instr.op1)) tokens.append(InstructionTextToken(InstructionTextTokenType.OperandSeparatorToken,", ")) tokens.append(InstructionTextToken(InstructionTextTokenType.RegisterToken,instr.op2)) return tokens,[] def jnz(self,mnem,instr,addr): target = instr.op3 + addr + 4 tokens = [InstructionTextToken(InstructionTextTokenType.InstructionToken,mnem+" ")] tokens.append(InstructionTextToken(InstructionTextTokenType.RegisterToken,instr.op1)) tokens.append(InstructionTextToken(InstructionTextTokenType.OperandSeparatorToken,", ")) tokens.append(InstructionTextToken(InstructionTextTokenType.AddressDisplayToken,hex(target),target)) true_branch = BranchInfo(BranchType.TrueBranch,target) false_branch = BranchInfo(BranchType.FalseBranch,addr +4) return tokens,[true_branch,false_branch] def jnzb(self,mnem,instr,addr): target = addr + 4 - instr.op3 tokens = [InstructionTextToken(InstructionTextTokenType.InstructionToken,mnem+" ")] tokens.append(InstructionTextToken(InstructionTextTokenType.RegisterToken,instr.op1)) tokens.append(InstructionTextToken(InstructionTextTokenType.OperandSeparatorToken,", ")) tokens.append(InstructionTextToken(InstructionTextTokenType.AddressDisplayToken,hex(target),target)) true_branch = 
BranchInfo(BranchType.TrueBranch,target) false_branch = BranchInfo(BranchType.FalseBranch,addr+4) return tokens,[true_branch,false_branch] def exit(self,mnem,instr,addr): tokens = [InstructionTextToken(InstructionTextTokenType.InstructionToken,mnem)] exit_branch = BranchInfo(BranchType.FunctionReturn) return tokens,[exit_branch] def ene(self,mnem,instr,addr): tokens,_ = self.two_reg(mnem,instr,addr) exit_branch = BranchInfo(BranchType.TrueBranch,addr+4) return tokens,[exit_branch] ``` ```