librsync  2.0.1
search.c
1 /*= -*- c-basic-offset: 4; indent-tabs-mode: nil; -*-
2  *
3  * librsync -- the library for network deltas
4  *
5  * Copyright (C) 1999, 2000, 2001, 2014 by Martin Pool <[email protected]>
6  * Copyright (C) 1999 by Andrew Tridgell
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU Lesser General Public License as published by
10  * the Free Software Foundation; either version 2.1 of the License, or
11  * (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public License
19  * along with this program; if not, write to the Free Software
20  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21  */
22 
23 /*
24  * This file contains code for searching the sumset for matching
25  * values.
26  */
27 
28 /*
29  * TODO: The common case is that the next block in both streams
30  * match. Can we make that a bit faster at all? We'd need to perhaps
31  * add a link forward between blocks in the sum_struct corresponding
32  * to the order they're found in the input; then before doing a search
33  * we can just check that pointer.
34  */
35 
36 #include "config.h"
37 
38 #include <string.h>
39 #include <stdlib.h>
40 
41 #include "librsync.h"
42 #include "trace.h"
43 #include "util.h"
44 #include "sumset.h"
45 #include "search.h"
46 #include "checksum.h"
47 
48 #define TABLE_SIZE (1<<16)
49 #define NULL_TAG (-1)
50 
51 #define gettag2(s1,s2) (((s1) + (s2)) & 0xFFFF)
52 #define gettag(sum) gettag2((sum)&0xFFFF,(sum)>>16)
53 
54 static void swap(rs_target_t *t1, rs_target_t *t2) {
55  unsigned short ts = t1->t;
56  t1->t = t2->t;
57  t2->t = ts;
58 
59  int ti = t1->i;
60  t1->i = t2->i;
61  t2->i = ti;
62 }
63 
64 static int rs_compare_targets(rs_target_t const *t1, rs_target_t const *t2, rs_signature_t * sums) {
65  int v = (int) t1->t - (int) t2->t;
66  if (v != 0)
67  return v;
68 
69  rs_weak_sum_t w1 = sums->block_sigs[t1->i].weak_sum;
70  rs_weak_sum_t w2 = sums->block_sigs[t2->i].weak_sum;
71 
72  v = (w1 > w2) - (w1 < w2);
73  if (v != 0)
74  return v;
75 
76  return memcmp(sums->block_sigs[t1->i].strong_sum,
77  sums->block_sigs[t2->i].strong_sum,
78  sums->strong_sum_len);
79 }
80 
81 static void heap_sort(rs_signature_t * sums) {
82  unsigned int i, j, n, k, p;
83  for (i = 1; i < sums->count; ++i) {
84  for (j = i; j > 0;) {
85  p = (j - 1) >> 1;
86  if (rs_compare_targets(&sums->targets[j], &sums->targets[p], sums) > 0)
87  swap(&sums->targets[j], &sums->targets[p]);
88  else
89  break;
90  j = p;
91  }
92  }
93 
94  for (n = sums->count - 1; n > 0;) {
95  swap(&sums->targets[0], &sums->targets[n]);
96  --n;
97  for (i = 0; ((i << 1) + 1) <= n;) {
98  k = (i << 1) + 1;
99  if ((k + 1 <= n) && (rs_compare_targets(&sums->targets[k], &sums->targets[k + 1], sums) < 0))
100  k = k + 1;
101  if (rs_compare_targets(&sums->targets[k], &sums->targets[i], sums) > 0)
102  swap(&sums->targets[k], &sums->targets[i]);
103  else
104  break;
105  i = k;
106  }
107  }
108 }
109 
110 rs_result
112 {
113  int i;
114 
115  sums->tag_table = calloc(TABLE_SIZE, sizeof(sums->tag_table[0]));
116  if (!sums->tag_table)
117  return RS_MEM_ERROR;
118 
119  if (sums->count > 0) {
120  sums->targets = calloc(sums->count, sizeof(rs_target_t));
121  if (!sums->targets) {
122  free(sums->tag_table);
123  sums->tag_table = NULL;
124  return RS_MEM_ERROR;
125  }
126 
127  for (i = 0; i < sums->count; i++) {
128  sums->targets[i].i = i;
129  sums->targets[i].t = gettag(sums->block_sigs[i].weak_sum);
130  }
131 
132  heap_sort(sums);
133  }
134 
135  for (i = 0; i < TABLE_SIZE; i++) {
136  sums->tag_table[i].l = NULL_TAG;
137  sums->tag_table[i].r = NULL_TAG;
138  }
139 
140  for (i = sums->count - 1; i >= 0; i--) {
141  sums->tag_table[sums->targets[i].t].l = i;
142  }
143 
144  for (i = 0; i < sums->count; i++) {
145  sums->tag_table[sums->targets[i].t].r = i;
146  }
147 
148  rs_trace("rs_build_hash_table done");
149  return RS_DONE;
150 }
151 
152 
153 
154 /*
155  * See if there is a match for the specified block INBUF..BLOCK_LEN in
156  * the checksum set, using precalculated WEAK_SUM.
157  *
158  * If we don't find a match on the weak checksum, then we just give
159  * up. If we do find a weak match, then we proceed to calculate the
160  * strong checksum for the current block, and see if it will match
161  * anything.
162  */
163 int
164 rs_search_for_block(rs_weak_sum_t weak_sum,
165  const rs_byte_t *inbuf,
166  size_t block_len,
167  rs_signature_t const *sig, rs_stats_t * stats,
168  rs_long_t * match_where)
169 {
170  /* Caller must have called rs_build_hash_table() by now */
171  if (!sig->tag_table)
172  rs_fatal("Must have called rs_build_hash_table() by now");
173 
174  rs_strong_sum_t strong_sum;
175  int got_strong = 0;
176  int hash_tag = gettag(weak_sum);
177  rs_tag_table_entry_t *bucket = &(sig->tag_table[hash_tag]);
178  int l = bucket->l; /* left bound of search region */
179  int r = bucket->r + 1; /* right bound of search region */
180  int v = 1; /* direction of next move: -ve left, +ve right */
181 
182  if (l == NULL_TAG)
183  return 0;
184 
185  while (l < r) {
186  int m = (l + r) >> 1; /* midpoint of search region */
187  int i = sig->targets[m].i;
188  rs_block_sig_t *b = &(sig->block_sigs[i]);
189  v = (weak_sum > b->weak_sum) - (weak_sum < b->weak_sum);
190  // v < 0 - weak_sum < b->weak_sum
191  // v == 0 - weak_sum == b->weak_sum
192  // v > 0 - weak_sum > b->weak_sum
193 
194  if (v == 0) {
195  if (!got_strong) {
196  /* Lazy calculate strong sum after finding weak match. */
197  if(sig->magic == RS_BLAKE2_SIG_MAGIC) {
198  rs_calc_blake2_sum(inbuf, block_len, &strong_sum);
199  } else if (sig->magic == RS_MD4_SIG_MAGIC) {
200  rs_calc_md4_sum(inbuf, block_len, &strong_sum);
201  } else {
202  /* Bad input data is checked in rs_delta_begin, so this
203  * should never be reached. */
204  rs_fatal("Unknown signature algorithm %#x", sig->magic);
205  return 0;
206  }
207  got_strong = 1;
208  }
209  v = memcmp(strong_sum, b->strong_sum, sig->strong_sum_len);
210 
211  if (v == 0) {
212  l = m;
213  r = m;
214  break;
215  }
216  }
217 
218  if (v > 0)
219  l = m + 1;
220  else
221  r = m;
222  }
223 
224  if ((l == r) && (l <= bucket->r)) {
225  int i = sig->targets[l].i;
226  rs_block_sig_t *b = &(sig->block_sigs[i]);
227  if (weak_sum != b->weak_sum)
228  return 0;
229  if (!got_strong) {
230  if(sig->magic == RS_BLAKE2_SIG_MAGIC) {
231  rs_calc_blake2_sum(inbuf, block_len, &strong_sum);
232  } else if (sig->magic == RS_MD4_SIG_MAGIC) {
233  rs_calc_md4_sum(inbuf, block_len, &strong_sum);
234  } else {
235  rs_error("Unknown signature algorithm - this is a BUG");
236  return 0; /* FIXME: Is this the best way to handle this? */
237  }
238  got_strong = 1;
239  }
240  v = memcmp(strong_sum, b->strong_sum, sig->strong_sum_len);
241  int token = b->i;
242  *match_where = (rs_long_t)(token - 1) * sig->block_len;
243  }
244 
245  return !v;
246 }
rs_result rs_build_hash_table(rs_signature_t *sums)
Call this after loading a signature to index it.
Definition: search.c:111
A signature file using the BLAKE2 hash.
Definition: librsync.h:106
Out of memory.
Definition: librsync.h:204
long long rs_long_t
A long integer type that can handle the largest file offsets.
Description of the match described by a signature.
Definition: sumset.h:34
A signature file with MD4 signatures.
Definition: librsync.h:97
Public header for librsync.
Performance statistics from a librsync encoding or decoding operation.
Definition: librsync.h:235
rs_result
Return codes from nonblocking rsync operations.
Definition: librsync.h:192
Definition: sumset.h:41
Completed successfully.
Definition: librsync.h:193