Package Libs :: Module librecognition
[hide private]
[frames] | no frames]

Source Code for Module Libs.librecognition

  1  """ 
  2  (c) Immunity, Inc. 2004-2007 
  3   
  4   
  5  U{Immunity Inc.<http://www.immunityinc.com>} 
  6   
  7   
  8  Library for function recognizing 
  9   
 10  """ 
 11  __VERSION__ = '1.2' 
 12   
 13   
 14  from libanalyze import * 
 15  from libdatatype import * 
 16  from libstackanalyze import * 
 17  import binascii 
 18  import struct 
 19  import hashlib 
 20  import re 
 21  import string 
 22  import debugger 
 23  import csv 
 24  import os 
 25   
26 -class MultiCSVIterator:
27 - def __init__(self, dictionaries):
28 if not isinstance(dictionaries, list): 29 dictionaries = [ dictionaries ] 30 31 self.iterators = [] 32 self.fds = [] 33 self.idx = 0 34 for d in dictionaries: 35 try: 36 fd = open(d, "rb") 37 except: 38 fd = open(d, "w+b") 39 self.iterators.append(csv.reader(fd)) 40 self.fds.append(fd)
41 - def __iter__(self):
42 for i in range(0, self.idx+1): 43 self.fds[i].seek(0) 44 self.idx = 0 45 return self
46
47 - def __del__(self):
48 while self.iterators: 49 self.iterators.pop() 50 for fd in self.fds: 51 fd.close() 52 del self.fds
53
54 - def next(self):
55 try: 56 data = self.iterators[self.idx].next() 57 except StopIteration: 58 if len(self.iterators) > self.idx+1: 59 self.idx += 1 60 return self.next() 61 else: 62 raise StopIteration 63 #append the filename to each line 64 data.append(self.fds[self.idx].name) 65 return data
66
67 -class FunctionRecognition:
68 - def __init__(self, imm, dictionaryfiles=None):
69 """ 70 This class try to recognize a function using different methods 71 (address/signature/heuristic). 72 73 @type imm: Debbuger OBJECT 74 @param imm: Debbuger instance 75 76 @type dictionaryfiles: STRING|LIST 77 @param dictionaryfiles: Name, or list of names, of .dat files inside the Data folder, where're stored the function 78 patterns. Use an empty string to use all .dat files in Data folder. 79 """ 80 self.imm = imm 81 self.heuristicReferencesCache = {} 82 self.heuristicCache = {} 83 self.resolvCache = {} 84 85 if not dictionaryfiles: 86 dictionaryfiles = [] 87 for file in os.listdir("Data"): 88 if file[-4:] == ".dat": 89 dictionaryfiles.append(os.path.join("Data", file)) 90 self.dictionaries = MultiCSVIterator(dictionaryfiles)
91
92 - def resolvFunctionByAddress(self, address, heuristic=90):
93 """ 94 Look up into our dictionaries to find a function match. 95 96 @type address: DWORD 97 @param address: Address of the function to search 98 99 @type heuristic: INTEGER 100 @param heuristic: heuristic threasold to consider a real function match 101 102 @rtype: STRING 103 @return: a STRING with the function's real name or the given address if there's no match 104 """ 105 106 #cache the answers 107 if self.resolvCache.has_key(address): 108 return self.resolvCache[address] 109 110 #try the exact hash method 111 exact = self.makeFunctionHashExact(address) 112 for data in self.dictionaries: 113 if exact == data[4]: 114 self.resolvCache[address] = data[0] 115 break 116 117 #try the heuristic method 118 if not self.resolvCache.has_key(address): 119 ref = self.selectBasicBlock(address) 120 posThreshold = 0 121 posName = "" 122 for data in self.dictionaries: 123 #cut down the possibilities, because the performance, reproducing the BB selection and comparing the result 124 #XXX: it's not a perfect way, thinking of supporting version changes 125 if ref == data[1]: 126 perc = self.checkHeuristic(address, data[2], data[3]) 127 #self.imm.Log("similar to function %s in %d%%" % (data[0], perc)) 128 if perc >= heuristic and perc > posThreshold: 129 posThreshold = perc 130 posName = data[0] 131 if posName: 132 self.resolvCache[address] = posName 133 134 #cache the negative answer 135 if not self.resolvCache.has_key(address): 136 self.resolvCache[address] = "%08X" % address 137 138 return self.resolvCache[address]
139
140 - def checkHeuristic(self, address, reference, refFirstCall=[]):
141 """ 142 Check a given address with a precomputed hash of a function. 143 Return a percentage of match (you can use a threasold to consider a real match) 144 145 @type address: DWORD 146 @param address: Address of the function to compare 147 148 @type reference: STRING 149 @param reference: base64 representation of the compressed information about the function 150 151 @type refFirstCall: STRING 152 @param refFirstCall: the same, but following the function pointed by the first call in the first BB. 153 (OPTIONAL) 154 155 @rtype: INTEGER 156 @return: heuristic threasold to consider a real function match 157 """ 158 159 #self.imm.Log("checking heuristically: %08X" % address) 160 161 #do the hard work just one time 162 if self.heuristicCache.has_key(address): 163 cfg = self.heuristicCache[address] 164 else: 165 cfg = self.makeFunctionHashHeuristic(address) 166 self.heuristicCache[address] = cfg 167 168 #check reference against our cache 169 sha1 = hashlib.sha1(reference+refFirstCall).digest() 170 if self.heuristicReferencesCache.has_key(sha1): 171 refcfg = self.heuristicReferencesCache[sha1] 172 else: 173 #This's the reference hash to compare with (uncompress just once and cache the results) 174 #Decode each BB-hash 175 refcfg = [] 176 refcfg.append([]) 177 refcfg.append([]) 178 data = binascii.a2b_base64(reference) 179 for o in range(0,len(data),12): 180 (start, left, right) = struct.unpack("LLL",data[o:o+12]) 181 refcfg[0].append([ start, left, right ]) 182 if refFirstCall: 183 data = binascii.a2b_base64(refFirstCall) 184 for o in range(0,len(data),12): 185 (start, left, right) = struct.unpack("LLL",data[o:o+12]) 186 refcfg[1].append([ start, left, right ]) 187 self.heuristicReferencesCache[sha1] = refcfg 188 189 perc1 = self.compareHeuristic(cfg[0][:], refcfg[0][:]) 190 if cfg[1] or refcfg[1]: 191 perc2 = self.compareHeuristic(cfg[1][:], refcfg[1][:]) 192 #use the average 193 perc = (perc1 + perc2) / 2 194 else: 195 perc = perc1 196 197 return perc
198
199 - def compareHeuristic(self, cfg, refcfg):
200 #for tmp in cfg: 201 #self.imm.Log("check start: %08X - left: %08X - right: %08X" % (tmp[0],tmp[1],tmp[2])) 202 203 #for tmp in refcfg: 204 #self.imm.Log("ref start: %08X - left: %08X - right: %08X" % (tmp[0],tmp[1],tmp[2])) 205 206 diff = eq = 0 207 checked = [] 208 #Compare each BB-hash 209 for info in cfg: 210 bbeq = value = 0 211 for rinfo in refcfg: 212 tmp = 0 213 if info[0] == rinfo[0]: tmp += 1 214 if info[1] == rinfo[1]: tmp += 1 215 if info[2] == rinfo[2]: tmp += 1 216 if tmp > bbeq: 217 bbeq = tmp 218 value = rinfo 219 if bbeq == 3: break 220 try: 221 idx=refcfg.index(value) 222 refcfg.pop(idx) 223 except ValueError: 224 pass 225 #self.imm.Log("value %s not found in refcfg" % value) 226 eq += bbeq 227 diff += 3 - bbeq 228 229 #crossed check 230 for rinfo in refcfg: 231 bbeq = value = 0 232 for info in cfg: 233 tmp = 0 234 if info[0] == rinfo[0]: tmp += 1 235 if info[1] == rinfo[1]: tmp += 1 236 if info[2] == rinfo[2]: tmp += 1 237 if tmp > bbeq: 238 bbeq = tmp 239 value = rinfo 240 if bbeq == 3: break 241 try: 242 idx=cfg.index(value) 243 cfg.pop(idx) 244 except ValueError: 245 pass 246 #self.imm.Log("value %s not found in cfg" % value) 247 eq += bbeq 248 diff += 3 - bbeq 249 250 #self.imm.Log("eq=%d, diff=%d" % (eq,diff)) 251 return eq * 100 / (eq + diff)
252
253 - def makeFunctionHashHeuristic(self, address, compressed = False, followCalls = True):
254 """ 255 Consider: 256 - Control Flow Graph 257 - generalized instructions that: 258 access memory/write memory/use registers/use constant/call/jmp/jmc 259 and all his combinations. 260 - special case of functions with just 1 BB and a couple of calls (follow the first call) 261 262 @type address: DWORD 263 @param address: address of the function to hash 264 265 @type compressed: Boolean 266 @param compressed: return a compressed base64 representation or the raw data 267 268 @type followCalls: Boolean 269 @param followCalls: follow the first call in a single basic block function 270 271 @rtype: LIST 272 @return: the first element is described below and the second is the result of this same function but over the first 273 call of a single basic block function (if applies), each element is like this: 274 a base64 representation of the compressed version of each bb hash: 275 [4 bytes BB(i) start][4 bytes BB(i) 1st edge][4 bytes BB(i) 2nd edge] 276 0 <= i < BB count 277 or the same but like a LIST with raw data. 278 """ 279 280 f = self.imm.getFunction(address) 281 bbs = f.getBasicBlocks() 282 bbmap = {} 283 cfg = {} 284 285 #Make a control flow graph 286 for bb in bbs: 287 cfg[bb.getStart()] = bb.getEdges() 288 289 #Make a hash of each BB 290 for bb in bbs: 291 bbhash_data = [] 292 for op in bb.getInstructions(self.imm): 293 #take into account just information about the opcode 294 instr = [] 295 instr.append(op.getMemType()) 296 instr.append(op.indexed) 297 instr.append(op.getCmdType()) 298 instr.append(op.optype[0]) 299 instr.append(op.optype[1]) 300 instr.append(op.optype[2]) 301 instr.append(op.getSize()) 302 bbhash_data.append(self.hash_a_list(instr)) 303 bbhash = self.hash_a_list(bbhash_data) 304 bbmap[bb.getStart()] = bbhash 305 306 #Replace BB addresses with hashes 307 rcfg = [] 308 for start,edges in cfg.iteritems(): 309 rstart = 0 310 redges = [0, 0] 311 rstart = bbmap[start] 312 if bbmap.has_key(edges[0]): 313 redges[0] = bbmap[edges[0]] 314 if bbmap.has_key(edges[1]): 315 redges[1] = bbmap[edges[1]] 316 rcfg.append([ rstart,redges[0],redges[1] ]) 317 318 #special case for functions with just one basic block and one or more calls 319 firstcall = [] 320 if followCalls and len(bbs) == 1 and len(bbs[0].getCalls()) > 0: 321 #we follow the first call and do the same work there, but avoiding recursion 322 #XXX: why the first? 323 op = self.imm.Disasm(bbs[0].getCalls()[0]) 324 if op.getJmpConst(): 325 firstcall = self.makeFunctionHashHeuristic(op.getJmpConst(), compressed, followCalls=False)[0] 326 #self.imm.Log("following first call to: %08X" % op.getJmpConst()) 327 del op 328 329 del bbs 330 del f 331 rcfg.sort() 332 333 if compressed: 334 #make the final hash 335 fhash = "" 336 for data in rcfg: 337 #[4 bytes BB(i) start][4 bytes BB(i) 1st edge][4 bytes BB(i) 2nd edge] 338 fhash += struct.pack("LLL", data[0], data[1], data[2]) 339 return [ binascii.b2a_base64(fhash)[:-1], firstcall ] 340 else: 341 return [ rcfg, firstcall ]
342
343 - def hash_a_list(self,data):
344 """ 345 Take a list and return a binary representation of his CRC32. 346 347 @type data: LIST 348 @param data: a list of elements to make the hash 349 350 @rtype: UNSIGNED LONG 351 @return: a hash of the given values 352 """ 353 354 ret = 0 355 for elem in data: 356 ret = binascii.crc32(str(elem), ret) 357 return struct.unpack("L", struct.pack("l",ret))[0]
358
359 - def searchFunctionByHeuristic(self, csvline, heuristic = 90, module = None):
360 """ 361 Search memory to find a function that fullfit the options. 362 363 @type csvline: STRING 364 @param csvline: A line of a Data CSV file. This's a simple support for copy 'n paste from a CSV file. 365 366 @type heuristic: INTEGER 367 @param heuristic: heuristic threasold to consider a real function match 368 369 @type module: STRING 370 @param module: name of a module to restrict the search 371 372 @rtype: LIST 373 @return: a list of tuples with possible function's addresses and the heauristic match percentage 374 """ 375 376 line = csv.reader([csvline]).next() 377 if len(line) < 9: line[7] = "" #support for older entries 378 return self._searchFunctionByHeuristic(line[1], line[2], line[3], line[4], heuristic, module, string.split(line[7],"|"))
379
380 - def _searchFunctionByHeuristic(self, search, functionhash=None, firstcallhash=None, exact=None, heuristic = 90, module = None, firstbb = None):
381 """ 382 Search memory to find a function that fullfit the options. 383 384 @type search: STRING 385 @param search: searchCommand string to make the first selection 386 387 @type functionhash: STRING 388 @param functionhash: the primary function hash (use makeFunctionHash to generate this value) 389 390 @type firstcallhash: STRING 391 @param firstcallhash: the hash of the first call on single BB functions (use makeFunctionHash to generate this value) 392 393 @type exact: STRING 394 @param exact: an exact function hash, this's a binary byte-per-byte hash (use makeFunctionHash to generate this value) 395 396 @type heuristic: INTEGER 397 @param heuristic: heuristic threasold to consider a real function match 398 399 @type module: STRING 400 @param module: name of a module to restrict the search 401 402 @type firstbb: STRING 403 @param firstbb: generalized assembler of the first BB (to search function begin) 404 405 @rtype: LIST 406 @return: a list of tuples with possible function's addresses and the heauristic match percentage 407 """ 408 409 #if the first argument is a LIST, decode it to each real argument of the function, following the order in the CSV file. 410 #this give us a simple support for copy 'n paste from the CSV file. 411 if isinstance(search, list): 412 search.reverse() 413 tmp = search[:] 414 if tmp: search = tmp.pop() 415 if tmp: functionhash = tmp.pop() 416 if tmp: firstcallhash = tmp.pop() 417 if tmp: exact = tmp.pop() 418 if tmp: version = tmp.pop() 419 if tmp: file = tmp.pop() 420 if tmp: firstbb = tmp.pop() 421 422 #this arguments are mandatory 423 if not search or not functionhash: 424 return None 425 426 if not firstcallhash: 427 firstcallhash = "" 428 429 heu_addy = None 430 heu_perc = 0 431 poss_functions = [] 432 poss_return = [] 433 search = string.replace(search, "\\n","\n") 434 if search: 435 if module: 436 #XXX: access directly isn't the best way to do this 437 for key,mod in debugger.Getallmodules().iteritems(): 438 if module.lower() in key.lower(): 439 poss_functions += self.imm.searchCommandsOnModule(mod[0], search) 440 else: 441 poss_functions = self.imm.searchCommands(search) 442 if poss_functions: 443 for poss in poss_functions: 444 #self.imm.Log("possible funct: %08X" % poss[0]) 445 addy = self.imm.getFunctionBegin(poss[0]) 446 if not addy: 447 #check entrypoint routine 448 for mod in self.imm.getAllModules().values(): 449 if mod.getMainentry(): 450 #self.imm.Log("mainentry: %08X" % mod.getMainentry()) 451 f = StackFunction(self.imm, mod.getMainentry()) 452 if f.isInsideFunction(poss[0]): 453 addy = mod.getMainentry() 454 break 455 if not addy and firstbb: 456