Nicolas Bernard's Toolbox - Bloom Filters

Download: bloom.h bloom.c

bloom.h

  1: /*
  2:  * Copyright 2005-2009 Nicolas Bernard <http://www.lafraze.net/nbernard/>
  3:  *
  4:  * Permission to use, copy, modify, and distribute this software for any
  5:  * purpose with or without fee is hereby granted, provided that the above
  6:  * copyright notice and this permission notice appear in all copies.
  7:  *
  8:  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  9:  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 10:  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 11:  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 12:  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 13:  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 14:  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 15:  */
 16: 
 17: #ifndef _BLOOM_H_
 18: #define _BLOOM_H_
 19: 
 20: /**
 21:  * newbloom create a new bloom filter.
 22:  * parameters:
 23:  *    first: size of the filter in bits.
 24:  *    second: number of hashes to use. must be in [1, 16]
 25:  *    last: should we use a random salt for this filter?
 26:  *      if set, it is harder for a malicious user to polute
 27:  *      the filter. The drawback is that a same value won't
 28:  *      be hashed in the same value in to different filter.
 29:  * returned value:
 30:  *    an int which is a bloom filter descriptor or a value < 0
 31:  *      in case of error.
 32:  *
 33:  * WARNING: OpenSSL should have been initialized beforehand.
 34: */
 35: extern int newbloom(unsigned long, unsigned int, unsigned int);
 36: 
 37: /**
 38:  * destroybloom destroy a bloom filter.
 39:  * parameters:
 40:  *    int bd, where bd is the descriptor of the filter to destroy
 41:  * returned value:
 42:  *    0 if successful, < 0 otherwise.
 43:  */
 44: extern int destroybloom(int);
 45: 
 46: /**
 47:  * add: add some data in a filter.
 48:  * parameters:
 49:  *    int bd, the descriptor of the filter in which the data will be
 50:  *      registered;
 51:  *    void* data, a buffer with the data;
 52:  *    unsigned long datasize, the size of the data buffer.
 53:  * returned value:
 54:  *    0 if successful, < 0 otherwise.
 55:  */
 56: extern int add(int, const void*, unsigned long);
 57: 
 58: 
 59: /**
 60:  * isin: match some data against a filter.
 61:  * parameters:
 62:  *    int bd, the descriptor of the filter in which the data will be
 63:  *      searched;
 64:  *    void* data, a buffer with the data;
 65:  *    unsigned long datasize, the size of the data buffer.
 66:  * returned value:
 67:  *    0 if the data match the filter, 0 if not, < 0 if an error occured
 68:  */
 69: extern int isin(int, const void*, unsigned long);
 70: 
 71: 
 72: /**
 73:  *
 74:  * The struct blhashes, combined with the bloomhash, addhash and isinbloom
 75:  * functions allow finer grained operations (compared to add and isin).
 76:  *
 77:  * nb: this should be considered as an opaque type.
 78:  *
 79:  */
 80: struct blhashes {
 81:         int bloomid;
 82:         unsigned long int hashes[16];
 83: };
 84: 
 85: /**
 86:  * bloomhash: compute the hash of some data (for the given bloomfilter bd)
 87:  * and return it in a struct blhashes.
 88:  *
 89:  * parameters:
 90:  *    int bd, the descriptor of the filter for which hash will be computed;
 91:  *    struct blhashes* blh, the structure in which the hash will be stored.
 92:  *            if NULL, it will be allocated dynamically.
 93:  *    void* data, the data to be hashed. They are not modified;
 94:  *    unsigned long datasize, the size of the data in octets.
 95:  *
 96:  * returned value:
 97:  */
 98: extern struct blhashes* bloomhash(int bd, struct blhashes* blh, const void* data, unsigned long datasize);
 99: 
100: /**
101:  * addhash: same as add, but for pre-hashed data.
102:  *
103:  * parameters:
104:  *    int bd, the descriptor of the filter in which the data will be
105:  *        registered;
106:  *    struct blhashes *blh, the hash of the data to be registered.
107:  *
108:  * returned value:
109:  *    0 if all is well;
110:  *    <0 if an error occures (consult sources or guru).
111:  */
112: extern int addhash(int bd, const struct blhashes *blh);
113: 
114: /**
115:  * isinbloom: takes a bloom descriptor and a struct blhashes and tell whether
116:  *     the corresponding data are already in the bloom filter or not.
117:  *
118:  * parameters:
119:  *     int bd, the bloom filter descriptor to search;
120:  *     struct blhashes *blh, the hash of the data to search for.
121:  *
122:  * returned value:
123:  *     < 0 in case of error;
124:  *     0 if there is no match;
125:  *     1 if there is a match (remember, this is probabilistic!).
126:  */
127: extern int isinbloom(int bd, const struct blhashes *blh);
128: 
129: /**
130:  * displayhash: (for debug purpose only): displays the content of a
131:  *   struct blhashes.
132:  * parameters:
133:  *    struct blhashes *blh, the structure to display;
134:  *    FILE* out, the stream on which the data have to be sent.
135:  */
136: extern void displayhash(struct blhashes *blh, FILE* out);
137: 
138: #ifdef WITH_DATALOC
139: 
140: #include "dataloc.h"
141: 
142: extern int bloompack(struct dataloc *loc, int bd);
143: extern int bloomunpack(struct dataloc *loc);
144: 
145: #endif /* WITH_DATALOC */
146: 
147: #endif /* _BLOOM_H_ */

bloom.c

  1: /*
  2:  * Copyright 2005-2009 Nicolas Bernard <http://www.lafraze.net/nbernard/>
  3:  *
  4:  * Permission to use, copy, modify, and distribute this software for any
  5:  * purpose with or without fee is hereby granted, provided that the above
  6:  * copyright notice and this permission notice appear in all copies.
  7:  *
  8:  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  9:  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 10:  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 11:  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 12:  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 13:  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 14:  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 15:  */
 16: 
 17: /* sha2 requires openssl >= 0.9.8 */
 18: /* sha1 is currently used */
 19: 
 20: #include <openssl/rand.h>
 21: #include <openssl/bn.h>
 22: #include <openssl/dh.h>
 23: #include <openssl/sha.h>
 24: #include <assert.h>
 25: #include <errno.h>
 26: #include <stdlib.h>
 27: #include <string.h>
 28: #include <unistd.h>
 29: 
 30: #include "bloom.h"
 31: #ifdef WITH_DATALOC
 32: #include "dataloc.h"
 33: #endif
 34: #include "debug.h"
 35: 
 36: #define BLOOMSINITSIZE 100
 37: #define SALTSIZE 64
 38: 
 39: struct bloom_internal {
 40:         unsigned long size;
 41:         unsigned char *filter;
 42:         unsigned int nbhashes;
 43:         unsigned char *salt;
 44: };
 45: 
 46: static int initialized = 0;
 47: static unsigned int bloomssize = 0;
 48: static struct bloom_internal **blooms = NULL;
 49: 
 50: static unsigned char*
 51: openssl_prandbuf(unsigned int size)
 52: {
 53:         unsigned char* buf = NULL;
 54:         BIGNUM* rnd = BN_new();
 55:         if (rnd == NULL) return NULL;
 56: 
 57:         buf = (unsigned char*) malloc(size);
 58:         if (buf == NULL) {BN_free(rnd); return NULL;}
 59: 
 60:         if (!BN_rand(rnd, size * 8, -1, 0)) {
 61:                 free(buf);
 62:                 return NULL;
 63:         }
 64: 
 65:         BN_bn2bin(rnd, buf);
 66:         BN_free(rnd);
 67:         return buf;
 68: }
 69: 
 70: static int
 71: existbloom(int f)
 72: {
 73:         if (!initialized)
 74:                 return 0;
 75:         if (f < 0)
 76:                 return 0;
 77:         if ((unsigned int) f >= bloomssize)
 78:                 return 0;
 79:         if (blooms[f] == NULL)
 80:                 return 0;
 81:         return 1;
 82: }
 83: 
 84: static int
 85: initblooms(void)
 86: {
 87:         if (initialized || bloomssize || blooms != NULL)
 88:                 return -1;
 89:         blooms = (struct bloom_internal**)
 90:                 calloc(BLOOMSINITSIZE, sizeof(struct bloom_internal*));
 91:         if (blooms == NULL)
 92:                 return -1;
 93:         bloomssize = BLOOMSINITSIZE;
 94:         initialized = 1;
 95:         return 0;
 96: }
 97: 
 98: static int
 99: resizeblooms(void)
100: {
101:         unsigned int i = 0;
102:         struct bloom_internal** blooms_tmp = NULL;
103:         if (!initialized || !bloomssize || blooms == NULL)
104:                 return -1;
105:         blooms_tmp = (struct bloom_internal**)
106:                 realloc(blooms,
107:                         2 * bloomssize * sizeof(struct bloom_internal*));
108:         if (blooms_tmp == NULL)
109:                 return -1;
110:         blooms = blooms_tmp;
111:         for (i = bloomssize; i < 2 * bloomssize; i++)
112:                 blooms[i] = NULL;
113:         bloomssize *= 2;
114:         return 0;
115: }
116: 
117: int
118: newbloom(unsigned long bsize, unsigned int nbhashes, unsigned int salted)
119: {
120:         unsigned short i = 0;
121:         if (!initialized)
122:                 if (initblooms())
123:                         return -1;
124:         for (i = 0; i < bloomssize; i++)
125:                 if (blooms[i] == NULL)
126:                         break;
127:         if (i == bloomssize) {
128:                 if (resizeblooms())
129:                         return -1;
130:         }
131:         if (salted != 0 && salted != 1)
132:                 return -7;
133:         if (nbhashes < 1 || nbhashes > 16)
134:                 return -7;
135:         {        /* we can now add a bloom at blooms[i] */
136:                 unsigned int j = 0;
137:                 unsigned long size = 0xDeadBeef;
138:                 unsigned char *tmp = (unsigned char*) 0xDeadBeef;
139:                 unsigned char *tmpsalt = NULL;
140:                 struct bloom_internal* tmpb =
141:                         (struct bloom_internal*) 0xDeadBeef;
142:                 size = bsize / 8;
143:                 if (bsize % 8)
144:                         size++;
145:                 assert(sizeof(unsigned char) == 1);
146:                 tmp = (unsigned char*) malloc(size);
147:                 if (tmp == NULL)
148:                         return -3;
149:                 tmpb = (struct bloom_internal*)
150:                         malloc(sizeof(struct bloom_internal));
151:                 if (tmpb == NULL) {
152:                         free(tmp);
153:                         return -3;
154:                 }
155:                 if (salted != 0) {
156:                         tmpsalt = openssl_prandbuf(SALTSIZE);
157:                         if (tmpsalt == NULL) {
158:                                 free(tmp);
159:                                 free(tmpb);
160:                                 return -3;
161:                         }
162:                 }
163: 
164:                 for (j = 0; j < size; j++)
165:                         tmp[j] = 0;
166:                 tmpb->filter = tmp;
167:                 tmpb->size = size;
168:                 tmpb->nbhashes = nbhashes;
169:                 tmpb->salt = tmpsalt;
170:                 blooms[i] = tmpb;
171:         }
172:         return (int) i;
173: }
174: 
175: int
176: destroybloom(int bd)
177: {
178:         if (!existbloom(bd))
179:                 return -2;
180:         {         /* destroy blooms[bd] */
181:                 struct bloom_internal* tmp = blooms[bd];
182:                 blooms[bd] = NULL;
183:                 free(tmp->filter);
184:                 if (tmp->salt != NULL) {
185:                         memset(tmp->salt, 0, 16);
186:                         free(tmp->salt);
187:                 }
188:                 free(tmp);
189:         }
190:         return 0;
191: }
192: 
193: static inline unsigned char
194: mask(unsigned int shift)
195: {
196:         assert(shift < 8);
197:         switch(shift) {
198:         case 0: return 1;
199:         case 1: return 2;
200:         case 2: return 4;
201:         case 3: return 8;
202:         case 4: return 16;
203:         case 5: return 32;
204:         case 6: return 64;
205:         case 7: return 128;
206:         }
207:         panic("aborting, shift: %u", shift);
208: }
209: 
210: //static const unsigned char mask[] = {1, 2, 4, 8, 16, 32, 64, 128};
211: 
212: static inline int
213: touchpos(int bd, unsigned long int pos)
214: {
215:         unsigned long part = pos / 8;
216:         unsigned int shift = pos % 8;
217:         if (blooms[bd]->size <= part)
218:                 return -4;
219:         blooms[bd]->filter[part] |= mask(shift);
220:         return 0;
221: }
222: 
223: static inline int
224: issetpos(int bd, unsigned long int pos)
225: {
226:         if (!existbloom(bd))
227:                 return -1;
228:         {
229:                 unsigned long part = pos / 8;
230:                 unsigned int shift = pos % 8;
231:                 if (blooms[bd]->size <= part)
232:                         return -4;
233:                 if (blooms[bd]->filter[part] & mask(shift))
234:                         return 1;
235:         }
236:         return 0;
237: }
238: 
239: void
240: displayhash(struct blhashes *blh, FILE* out)
241: {
242:         assert(blh != NULL);
243:         assert(out != NULL);
244:         fprintf(out, "[%d][%lX:%lX:%lX:%lX:%lX:%lX:%lX:%lX:%lX:%lX:%lX:%lX:"
245:                 "%lX:%lX:%lX:%lX]\n", blh->bloomid,
246:                 blh->hashes[0], blh->hashes[1], blh->hashes[2], blh->hashes[3],
247:                 blh->hashes[4], blh->hashes[5], blh->hashes[6], blh->hashes[7],
248:                 blh->hashes[8], blh->hashes[9], blh->hashes[10],
249:                 blh->hashes[11], blh->hashes[12], blh->hashes[13],
250:                 blh->hashes[14], blh->hashes[15]);
251: }
252: 
253: static int
254: hash(int bd, unsigned long int hashes[16], const void* data,
255:      unsigned long datasize)
256: {
257:         unsigned char hash[64]; /* Flawfinder: ignore */
258:         unsigned char tmp[20+SALTSIZE]; /* Flawfinder: ignore */
259:         assert(blooms[bd]->nbhashes >= 1 && blooms[bd]->nbhashes < 17);
260:         if (data == NULL || datasize == 0)
261:                 return -1;
262: /*        SHA512(data, datasize, hash); */
263:         SHA1(data, datasize, tmp + SALTSIZE);
264:         if (blooms[bd]->salt != NULL) {
265:                 /* Flawfinder: ignore */
266:                 memcpy(tmp, blooms[bd]->salt, SALTSIZE);
267:                 SHA1(tmp, SALTSIZE+20, hash);
268:         } else
269:                 SHA1(tmp + SALTSIZE, 20, hash);
270:         SHA1(hash, 20, hash + 20);
271:         SHA1(hash + 20, 20, hash + 40);
272:         SHA1(hash + 10, 20, hash + 44);
273: 
274: #if 0
275:         /* verify assumption on data sizes */
276:         assert(sizeof(unsigned long int) * 16 == sizeof hash);
277:         /* Flawfinder: ignore */
278:         memcpy(hashes, hash, sizeof hash);
279: #else
280:         /* this works even if unsigned long int are more than 32 bits long.
281:            While they will then be only half initialized, this should not be
282:            an issue, unless we have bloom filters bigger than 4Gio. */
283:         for (int i = 0; i < 16; i++) {
284:                 hashes[i] = 0;
285:                 hashes[i] = ((uint32_t*) hash)[i];
286:         }
287: #endif
288: 
289:         return 0;
290: }
291: 
292: struct blhashes*
293: bloomhash(int bd, struct blhashes* blh, const void* data, unsigned long datasize)
294: {
295:         if (!existbloom(bd))
296:                 return NULL;
297:         if (blh == NULL)
298:                 blh = (struct blhashes*) calloc(1, sizeof(struct blhashes));
299:         if (blh == NULL)
300:                 return NULL;
301:         blh->bloomid = bd;
302:         if (hash(bd, blh->hashes, data, datasize) == 0)
303:                 return blh;
304:         return NULL;
305: }
306: 
307: int
308: add(int bd, const void* data, unsigned long datasize)
309: {
310:         unsigned int i = 0;
311:         unsigned long int hashes[16];
312:         if (!existbloom(bd))
313:                 return -1;
314:         if (hash(bd, hashes, data, datasize) != 0)
315:                 return -5;
316:         for (i = 0; i < blooms[bd]->nbhashes; i++) {
317:                 int ret = touchpos(bd, hashes[i] % (blooms[bd]->size * 8));
318:                 if (ret != 0)
319:                         return ret;
320:         }
321:         return 0;
322: }
323: 
324: int
325: addhash(int bd, const struct blhashes *blh)
326: {
327:         unsigned int i = 0;
328:         assert(blh != NULL);
329:         if (!existbloom(bd))
330:                 return -1;
331:         if (blh->bloomid != bd)
332:                 return -2;
333:         for (i = 0; i < blooms[bd]->nbhashes; i++) {
334:                 int ret = touchpos(bd, blh->hashes[i] % (blooms[bd]->size * 8));
335:                 if (ret != 0)
336:                         return ret;
337:         }
338:         return 0;
339: }
340: 
341: int
342: isin(int bd, const void* data, unsigned long datasize)
343: {
344:         unsigned int i = 0;
345:         unsigned long int hashes[16];
346:         if (!existbloom(bd)) {
347:                 stkdbg("Invalid bloom descriptor! (%d)", bd);
348:                 return -1;
349:         }
350:         if (hash(bd, hashes, data, datasize) != 0) {
351:                 stkdbg("bloom hash failed");
352:                 return -5;
353:         }
354:         for (i = 0; i < blooms[bd]->nbhashes; i++) {
355:                 int ret = issetpos(bd, hashes[i] % (blooms[bd]->size * 8));
356:                 if (ret != 1)
357:                         return ret;
358:         }
359:         return 1;
360: }
361: 
362: int
363: isinbloom(int bd, const struct blhashes *blh)
364: {
365:         unsigned int i = 0;
366:         assert(blh != NULL);
367:         if (!existbloom(bd)) {
368:                 stkdbg("Invalid bloom descriptor! (%d)", bd);
369:                 return -1;
370:         }
371:         if (blh->bloomid != bd) {
372:                 stkdbg("Error, comparison with data of another hash! (%d, %d)",
373:                        blh->bloomid, bd);
374:                 return -2;
375:         }
376:         for (i = 0; i < blooms[bd]->nbhashes; i++) {
377:                 int ret = issetpos(bd, blh->hashes[i] % (blooms[bd]->size * 8));
378:                 if (ret != 1)
379:                         return ret;
380:         }
381:         return 1;
382: }
383: 
384: #ifdef WITH_DATALOC
385: 
386: int
387: bloompack(struct dataloc *loc, int bd)
388: {
389:         uint8_t isdata = 1;
390:         if (!existbloom(bd)) {
391:                 isdata = 0;
392:         }
393: 
394:         int err = writeloc(loc, &isdata, sizeof isdata);
395:         if (err < 0) {
396:                 debug("writeloc: %d %s", err,
397:                       (err == -2 || err == -3 || err == -20) ?
398:                       strerror(errno) : "");
399:                 return -10;
400:         }
401:         if (!isdata)
402:                 return 0;
403: 
404:         debug("will write filter of size %lu", blooms[bd]->size);
405:         err = writeloc(loc, &blooms[bd]->size, sizeof blooms[bd]->size);
406:         if (err < 0) {
407:                 debug("writeloc: %d %s", err,
408:                       (err == -2 || err == -3 || err == -20) ?
409:                       strerror(errno) : "");
410:                 return -20;
411:         }
412:         err = writeloc(loc, blooms[bd]->filter, blooms[bd]->size);
413:         if (err < 0) {
414:                 debug("writeloc: %d %s", err,
415:                       (err == -2 || err == -3 || err == -20) ?
416:                       strerror(errno) : "");
417:                 return -30;
418:         }
419:         err = writeloc(loc, &blooms[bd]->nbhashes, sizeof blooms[bd]->nbhashes);
420:         if (err < 0) {
421:                 debug("writeloc: %d %s", err,
422:                       (err == -2 || err == -3 || err == -20) ?
423:                       strerror(errno) : "");
424:                 return -40;
425:         }
426:         err = writeloc(loc, blooms[bd]->salt, SALTSIZE);
427:         if (err < 0) {
428:                 debug("writeloc: %d %s", err,
429:                       (err == -2 || err == -3 || err == -20) ?
430:                       strerror(errno) : "");
431:                 return -50;
432:         }
433:         return 0;
434: }
435: 
436: int
437: bloomunpack(struct dataloc *loc)
438: {
439:         int err = 0;
440:         unsigned int i = 0;
441:         if (!initialized)
442:                 if (initblooms())
443:                         return -1;
444: 
445:         uint8_t isdata = 123;
446:         err = readloc(loc, &isdata, sizeof isdata);
447:         if (err < 0)
448:                 return -3;
449:         if (isdata == 0)
450:                 return 0;
451:         if (isdata != 1)
452:                 return -5;
453: 
454:         for (i = 0; i < bloomssize; i++)
455:                 if (blooms[i] == NULL)
456:                         break;
457:         if (i == bloomssize) {
458:                 if (resizeblooms())
459:                         return -10;
460:         }
461:         struct bloom_internal* tmpb =
462:                         (struct bloom_internal*) 0xDeadBeef;
463:         tmpb = (struct bloom_internal*)
464:                         malloc(sizeof(struct bloom_internal));
465:         if (tmpb == NULL)
466:                 return -20;
467: 
468:         err = readloc(loc, &tmpb->size, sizeof tmpb->size);
469:         if (err < 0) {
470:                 free(tmpb);
471:                 return -30;
472:         }
473:         debug("will read filter of size %lu", tmpb->size);
474:         tmpb->filter = (unsigned char*) malloc(tmpb->size);
475:         if (tmpb->filter == NULL) {
476:                 free(tmpb);
477:                 return -40;
478:         }
479:         tmpb->salt = (unsigned char*) malloc(SALTSIZE);
480:         if (tmpb->salt == NULL) {
481:                 free(tmpb->filter);
482:                 free(tmpb);
483:                 return -45;
484:         }
485: 
486:         err = readloc(loc, tmpb->filter, tmpb->size);
487:         if (err < 0) {
488:                 free(tmpb->filter);
489:                 free(tmpb->salt);
490:                 free(tmpb);
491:                 return -50;
492:         }
493: 
494:         err = readloc(loc, &tmpb->nbhashes, sizeof tmpb->nbhashes);
495:         if (err < 0) {
496:                 free(tmpb->filter);
497:                 free(tmpb->salt);
498:                 free(tmpb);
499:                 return -60;
500:         }
501: 
502:         err = readloc(loc, tmpb->salt, SALTSIZE);
503:         if (err < 0) {
504:                 free(tmpb->filter);
505:                 free(tmpb->salt);
506:                 free(tmpb);
507:                 return -70;
508:         }
509:         blooms[i] = tmpb;
510:         return (int) i;
511: }
512: 
513: #endif /* WITH_DATALOC */