librsync  2.3.4
sumset.c
1/*= -*- c-basic-offset: 4; indent-tabs-mode: nil; -*-
2 *
3 * librsync -- library for network deltas
4 *
5 * Copyright (C) 1999, 2000, 2001 by Martin Pool <mbp@sourcefrog.net>
6 * Copyright (C) 1999 by Andrew Tridgell
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU Lesser General Public License as published by
10 * the Free Software Foundation; either version 2.1 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU Lesser General Public License for more details.
17 *
18 * You should have received a copy of the GNU Lesser General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 */
22
23#include "config.h" /* IWYU pragma: keep */
24#include <stdlib.h>
25#include <string.h>
26#include "librsync.h"
27#include "sumset.h"
28#include "trace.h"
29#include "util.h"
30
31static void rs_block_sig_init(rs_block_sig_t *sig, rs_weak_sum_t weak_sum,
32 rs_strong_sum_t *strong_sum, int strong_len)
33{
34 sig->weak_sum = weak_sum;
35 if (strong_sum)
36 memcpy(sig->strong_sum, strong_sum, (size_t)strong_len);
37}
38
39static inline unsigned rs_block_sig_hash(const rs_block_sig_t *sig)
40{
41 return (unsigned)sig->weak_sum;
42}
43
44typedef struct rs_block_match {
45 rs_block_sig_t block_sig;
46 rs_signature_t *signature;
47 const void *buf;
48 size_t len;
50
51static void rs_block_match_init(rs_block_match_t *match, rs_signature_t *sig,
52 rs_weak_sum_t weak_sum,
53 rs_strong_sum_t *strong_sum, const void *buf,
54 size_t len)
55{
56 rs_block_sig_init(&match->block_sig, weak_sum, strong_sum,
57 sig->strong_sum_len);
58 match->signature = sig;
59 match->buf = buf;
60 match->len = len;
61}
62
63static inline int rs_block_match_cmp(rs_block_match_t *match,
64 const rs_block_sig_t *block_sig)
65{
66 /* If buf is not NULL, the strong sum is yet to be calculated. */
67 if (match->buf) {
68#ifndef HASHTABLE_NSTATS
69 match->signature->calc_strong_count++;
70#endif
71 rs_signature_calc_strong_sum(match->signature, match->buf, match->len,
72 &(match->block_sig.strong_sum));
73 match->buf = NULL;
74 }
75 return memcmp(&match->block_sig.strong_sum, &block_sig->strong_sum,
76 (size_t)match->signature->strong_sum_len);
77}
78
79/* Disable mix32() in the hashtable because RabinKarp doesn't need it. We
80 manually apply mix32() to rollsums before using them in the hashtable. */
81#define HASHTABLE_NMIX32
82/* Instantiate hashtable for rs_block_sig and rs_block_match. */
83#define ENTRY rs_block_sig
84#define MATCH rs_block_match
85#define NAME hashtable
86#include "hashtable.h"
87
88/* Get the size of a packed rs_block_sig_t. */
89static inline size_t rs_block_sig_size(const rs_signature_t *sig)
90{
91 /* Round up to multiple of sizeof(weak_sum) to align memory correctly. */
92 const size_t mask = sizeof(rs_weak_sum_t)- 1;
93 return (offsetof(rs_block_sig_t, strong_sum) +
94 (((size_t)sig->strong_sum_len + mask) & ~mask));
95}
96
97/* Get the pointer to the block_sig_t from a block index. */
98static inline rs_block_sig_t *rs_block_sig_ptr(const rs_signature_t *sig,
99 int block_idx)
100{
101 return (rs_block_sig_t *)((char *)sig->block_sigs +
102 block_idx * rs_block_sig_size(sig));
103}
104
105/* Get the index of a block from a block_sig_t pointer. */
106static inline int rs_block_sig_idx(const rs_signature_t *sig,
107 rs_block_sig_t *block_sig)
108{
109 return (int)(((char *)block_sig -
110 (char *)sig->block_sigs) / rs_block_sig_size(sig));
111}
112
113rs_result rs_sig_args(rs_long_t old_fsize, rs_magic_number * magic,
114 size_t *block_len, size_t *strong_len)
115{
116 size_t rec_block_len; /* the recomended block_len for the given
117 old_fsize. */
118 size_t min_strong_len; /* the minimum strong_len for the given
119 old_fsize and block_len. */
120 size_t max_strong_len; /* the maximum strong_len for the given magic. */
121
122 /* Check and set default arguments. */
123 *magic = *magic ? *magic : RS_RK_BLAKE2_SIG_MAGIC;
124 switch (*magic) {
127 max_strong_len = RS_BLAKE2_SUM_LENGTH;
128 break;
129 case RS_MD4_SIG_MAGIC:
131 max_strong_len = RS_MD4_SUM_LENGTH;
132 break;
133 default:
134 rs_error("invalid magic %#x", *magic);
135 return RS_BAD_MAGIC;
136 }
137 /* The recommended block_len is sqrt(old_fsize) with a 256 min size rounded
138 down to a multiple of the 128 byte blake2b blocksize to give a
139 reasonable compromise between signature size, delta size, and
140 performance. If the old_fsize is unknown, we use the default. */
141 if (old_fsize < 0) {
142 rec_block_len = RS_DEFAULT_BLOCK_LEN;
143 } else {
144 rec_block_len =
145 old_fsize <= 256 * 256 ? 256 : rs_long_sqrt(old_fsize) & ~127;
146 }
147 if (*block_len == 0)
148 *block_len = rec_block_len;
149 /* The recommended strong_len assumes the worst case new_fsize = old_fsize
150 + 16MB with no matches. This results in comparing a block at every byte
151 offset against all the blocks in the signature, or new_fsize*block_num
152 comparisons. With N bits in the blocksig, there is a 1/2^N chance per
153 comparison of a hash colision. So with 2^N attempts there would be a
154 fair chance of having a collision. So we want to round up to the next
155 byte, add an extra 2 bytes (16 bits) in the strongsum, and assume the
156 weaksum is worth another 16 bits, for at least 32 bits extra, giving a
157 worst case 1/2^32 chance of having a hash collision per delta. If
158 old_fsize is unknown we use a conservative default. */
159 if (old_fsize < 0) {
160 min_strong_len = RS_DEFAULT_MIN_STRONG_LEN;
161 } else {
162 min_strong_len =
163 2 + (rs_long_ln2(old_fsize + ((rs_long_t)1 << 24)) +
164 rs_long_ln2(old_fsize / *block_len + 1) + 7) / 8;
165 }
166 if (*strong_len == 0)
167 *strong_len = max_strong_len;
168 else if (*strong_len == -1)
169 *strong_len = min_strong_len;
170 else if (old_fsize >= 0 && *strong_len < min_strong_len) {
171 rs_warn("strong_len=" FMT_SIZE " smaller than recommended minimum "
172 FMT_SIZE " for old_fsize=" FMT_LONG " with block_len=" FMT_SIZE,
173 *strong_len, min_strong_len, old_fsize, *block_len);
174 } else if (*strong_len > max_strong_len) {
175 rs_error("invalid strong_len=" FMT_SIZE " for magic=%#x", *strong_len,
176 (int)*magic);
177 return RS_PARAM_ERROR;
178 }
179 rs_sig_args_check(*magic, *block_len, *strong_len);
180 return RS_DONE;
181}
182
184 size_t block_len, size_t strong_len,
185 rs_long_t sig_fsize)
186{
187 rs_result result;
188
189 /* Check and set default arguments, using old_fsize=-1 for unknown. */
190 if ((result = rs_sig_args(-1, &magic, &block_len, &strong_len)) != RS_DONE)
191 return result;
192 /* Set attributes from args. */
193 sig->magic = magic;
194 sig->block_len = (int)block_len;
195 sig->strong_sum_len = (int)strong_len;
196 sig->count = 0;
197 /* Calculate the number of blocks if we have the signature file size. */
198 /* Magic+header is 12 bytes, each block thereafter is 4 bytes
199 weak_sum+strong_sum_len bytes */
200 sig->size = (int)(sig_fsize < 12 ? 0 : (sig_fsize - 12) / (4 + strong_len));
201 if (sig->size)
202 sig->block_sigs =
203 rs_alloc(sig->size * rs_block_sig_size(sig),
204 "signature->block_sigs");
205 else
206 sig->block_sigs = NULL;
207 sig->hashtable = NULL;
208#ifndef HASHTABLE_NSTATS
209 sig->calc_strong_count = 0;
210#endif
212 return RS_DONE;
213}
214
216{
217 hashtable_free(sig->hashtable);
218 free(sig->block_sigs);
219 rs_bzero(sig, sizeof(*sig));
220}
221
223 rs_weak_sum_t weak_sum,
224 rs_strong_sum_t *strong_sum)
225{
227 /* Apply mix32() to rollsum weaksums to improve their distribution. */
228 if (rs_signature_weaksum_kind(sig) == RS_ROLLSUM)
229 weak_sum = mix32(weak_sum);
230 /* If block_sigs is full, allocate more space. */
231 if (sig->count == sig->size) {
232 sig->size = sig->size ? sig->size * 2 : 16;
233 sig->block_sigs =
234 rs_realloc(sig->block_sigs, sig->size * rs_block_sig_size(sig),
235 "signature->block_sigs");
236 }
237 rs_block_sig_t *b = rs_block_sig_ptr(sig, sig->count++);
238 rs_block_sig_init(b, weak_sum, strong_sum, sig->strong_sum_len);
239 return b;
240}
241
242rs_long_t rs_signature_find_match(rs_signature_t *sig, rs_weak_sum_t weak_sum,
243 void const *buf, size_t len)
244{
247
249 rs_block_match_init(&m, sig, weak_sum, NULL, buf, len);
250 if ((b = hashtable_find(sig->hashtable, &m))) {
251 return (rs_long_t)rs_block_sig_idx(sig, b) * sig->block_len;
252 }
253 return -1;
254}
255
257{
258#ifndef HASHTABLE_NSTATS
259 hashtable_t *t = sig->hashtable;
260
261 rs_log(RS_LOG_INFO | RS_LOG_NONAME,
262 "match statistics: signature[%ld searches, %ld (%.3f%%) matches, "
263 "%ld (%.3fx) weak sum compares, %ld (%.3f%%) strong sum compares, "
264 "%ld (%.3f%%) strong sum calcs]", t->find_count, t->match_count,
265 100.0 * (double)t->match_count / (double)t->find_count,
266 t->hashcmp_count, (double)t->hashcmp_count / (double)t->find_count,
268 100.0 * (double)t->entrycmp_count / (double)t->find_count,
270 100.0 * (double)sig->calc_strong_count / (double)t->find_count);
271#endif
272}
273
275{
278 int i;
279
281 sig->hashtable = hashtable_new(sig->count);
282 if (!sig->hashtable)
283 return RS_MEM_ERROR;
284 for (i = 0; i < sig->count; i++) {
285 b = rs_block_sig_ptr(sig, i);
286 rs_block_match_init(&m, sig, b->weak_sum, &b->strong_sum, NULL, 0);
287 if (!hashtable_find(sig->hashtable, &m))
288 hashtable_add(sig->hashtable, b);
289 }
290 hashtable_stats_init(sig->hashtable);
291 return RS_DONE;
292}
293
295{
296 rs_signature_done(psums);
297 free(psums);
298}
299
301{
302 int i;
304 char strong_hex[RS_MAX_STRONG_SUM_LENGTH * 3];
305
306 rs_log(RS_LOG_INFO | RS_LOG_NONAME,
307 "sumset info: magic=%#x, block_len=%d, block_num=%d", sums->magic,
308 sums->block_len, sums->count);
309
310 for (i = 0; i < sums->count; i++) {
311 b = rs_block_sig_ptr(sums, i);
312 rs_hexify(strong_hex, b->strong_sum, sums->strong_sum_len);
313 rs_log(RS_LOG_INFO | RS_LOG_NONAME,
314 "sum %6d: weak=" FMT_WEAKSUM ", strong=%s", i, b->weak_sum,
315 strong_hex);
316 }
317}
A generic open addressing hashtable.
static unsigned mix32(unsigned h)
MurmurHash3 finalization mix function.
Definition: hashtable.h:170
Public header for librsync.
LIBRSYNC_EXPORT rs_result rs_build_hash_table(rs_signature_t *sums)
Call this after loading a signature to index it.
Definition: sumset.c:274
LIBRSYNC_EXPORT void rs_signature_log_stats(rs_signature_t const *sig)
Log the rs_signature_delta match stats.
Definition: sumset.c:256
LIBRSYNC_EXPORT rs_result rs_sig_args(rs_long_t old_fsize, rs_magic_number *magic, size_t *block_len, size_t *strong_len)
Get or check signature arguments for a given file size.
Definition: sumset.c:113
LIBRSYNC_EXPORT void rs_free_sumset(rs_signature_t *)
Deep deallocation of checksums.
Definition: sumset.c:294
rs_result
Return codes from nonblocking rsync operations.
Definition: librsync.h:180
@ RS_MEM_ERROR
Out of memory.
Definition: librsync.h:189
@ RS_DONE
Completed successfully.
Definition: librsync.h:181
@ RS_PARAM_ERROR
Bad value passed in to library, probably an application bug.
Definition: librsync.h:200
@ RS_BAD_MAGIC
Bad magic number at start of stream.
Definition: librsync.h:193
#define RS_DEFAULT_BLOCK_LEN
Default block length, if not determined by any other factors.
Definition: librsync.h:367
@ RS_LOG_INFO
Informational.
Definition: librsync.h:125
rs_magic_number
A uint32 magic number, emitted in bigendian/network order at the start of librsync files.
Definition: librsync.h:65
@ RS_BLAKE2_SIG_MAGIC
A signature file using the BLAKE2 hash.
Definition: librsync.h:89
@ RS_MD4_SIG_MAGIC
A signature file with MD4 signatures.
Definition: librsync.h:82
@ RS_RK_BLAKE2_SIG_MAGIC
A signature file with RabinKarp rollsum and BLAKE2 hash.
Definition: librsync.h:109
@ RS_RK_MD4_SIG_MAGIC
A signature file with RabinKarp rollsum and MD4 hash.
Definition: librsync.h:99
#define RS_DEFAULT_MIN_STRONG_LEN
Default minimum strong sum length, if the filesize is unknown.
Definition: librsync.h:373
LIBRSYNC_EXPORT void rs_sumset_dump(rs_signature_t const *)
Dump signatures to the log.
Definition: sumset.c:300
LIBRSYNC_EXPORT void rs_hexify(char *to_buf, void const *from_buf, int from_len)
Convert from_len bytes at from_buf into a hex representation in to_buf, which must be twice as long p...
Definition: hex.c:23
The hashtable type.
Definition: hashtable.h:128
long find_count
The count of finds tried.
Definition: hashtable.h:137
long match_count
The count of matches found.
Definition: hashtable.h:138
long entrycmp_count
The count of entry compares done.
Definition: hashtable.h:140
long hashcmp_count
The count of hash compares done.
Definition: hashtable.h:139
Signature of a single block.
Definition: sumset.h:35
rs_strong_sum_t strong_sum
Block's strong checksum.
Definition: sumset.h:37
rs_weak_sum_t weak_sum
Block's weak checksum.
Definition: sumset.h:36
Signature of a whole file.
Definition: sumset.h:44
int count
Total number of blocks.
Definition: sumset.h:48
int size
Total number of blocks allocated.
Definition: sumset.h:49
int magic
The signature magic value.
Definition: sumset.h:45
hashtable_t * hashtable
The hashtable for finding matches.
Definition: sumset.h:51
int block_len
The block length.
Definition: sumset.h:46
void * block_sigs
The packed block_sigs for all blocks.
Definition: sumset.h:50
long calc_strong_count
The count of strongsum calcs done.
Definition: sumset.h:54
int strong_sum_len
The block strong sum length.
Definition: sumset.h:47
The rs_signature class implementation of a file signature.
static void rs_signature_calc_strong_sum(rs_signature_t const *sig, void const *buf, size_t len, rs_strong_sum_t *sum)
Calculate the strong sum of a buffer.
Definition: sumset.h:136
#define rs_sig_args_check(magic, block_len, strong_len)
Assert that rs_sig_args() args for rs_signature_init() are valid.
Definition: sumset.h:92
rs_result rs_signature_init(rs_signature_t *sig, rs_magic_number magic, size_t block_len, size_t strong_len, rs_long_t sig_fsize)
Initialize an rs_signature instance.
Definition: sumset.c:183
rs_long_t rs_signature_find_match(rs_signature_t *sig, rs_weak_sum_t weak_sum, void const *buf, size_t len)
Find a matching block offset in a signature.
Definition: sumset.c:242
rs_block_sig_t * rs_signature_add_block(rs_signature_t *sig, rs_weak_sum_t weak_sum, rs_strong_sum_t *strong_sum)
Add a block to an rs_signature instance.
Definition: sumset.c:222
void rs_signature_done(rs_signature_t *sig)
Destroy an rs_signature instance.
Definition: sumset.c:215
static weaksum_kind_t rs_signature_weaksum_kind(rs_signature_t const *sig)
Get the weaksum kind for a signature.
Definition: sumset.h:114
#define rs_signature_check(sig)
Assert that a signature is valid.
Definition: sumset.h:107
logging functions.
@ RS_LOG_NONAME
Don't show function name in message.
Definition: trace.h:87
Misc utility functions used by librsync.