librsync  2.0.1
delta.c
1 /*= -*- c-basic-offset: 4; indent-tabs-mode: nil; -*-
2  *
3  * librsync -- library for network deltas
4  *
5  * Copyright (C) 2000, 2001 by Martin Pool <[email protected]>
6  * Copyright (C) 2003 by Donovan Baarda <[email protected]>
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU Lesser General Public License as published by
10  * the Free Software Foundation; either version 2.1 of the License, or
11  * (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public License
19  * along with this program; if not, write to the Free Software
20  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21  */
22 
23  /*
24  | Let's climb to the TOP of that
25  | MOUNTAIN and think about STRIP
26  | MINING!!
27  */
28 
29 
30 /*
31  * delta.c -- Generate in streaming mode an rsync delta given a set of
32  * signatures, and a new file.
33  *
34  * The size of blocks for signature generation is determined by the
35  * block size in the incoming signature.
36  *
37  * To calculate a signature, we need to be able to see at least one
38  * block of the new file at a time. Once we have that, we calculate
39  * its weak signature, and see if there is any block in the signature
40  * hash table that has the same weak sum. If there is one, then we
41  * also compute the strong sum of the new block, and cross check that.
42  * If they're the same, then we can assume we have a match.
43  *
44  * The final block of the file has to be handled a little differently,
45  * because it may be a short match. Short blocks in the signature
46  * don't include their length -- we just allow for the final short
47  * block of the file to match any block in the signature, and if they
48  * have the same checksum we assume they must have the same length.
49  * Therefore, when we emit a COPY command, we have to send it with a
50  * length that is the same as the block matched, and not the block
51  * length from the signature.
52  */
53 
54 /*
55  * Profiling results as of v1.26, 2001-03-18:
56  *
57  * If everything matches, then we spend almost all our time in
58  * rs_mdfour64 and rs_weak_sum, which is unavoidable and therefore a
59  * good profile.
60  *
61  * If nothing matches, it is not so good.
62  */
63 
64 
65 #include "config.h"
66 
67 #include <assert.h>
68 #include <stdlib.h>
69 #include <stdio.h>
70 
71 #include "librsync.h"
72 #include "emit.h"
73 #include "stream.h"
74 #include "util.h"
75 #include "sumset.h"
76 #include "job.h"
77 #include "trace.h"
78 #include "search.h"
79 #include "types.h"
80 #include "rollsum.h"
81 
82 const int RS_MD4_SUM_LENGTH = 16;
83 const int RS_BLAKE2_SUM_LENGTH = 32;
84 
85 /**
86  * 2002-06-26: Donovan Baarda
87  *
88  * The following is based entirely on pysync. It is much cleaner than the
89  * previous incarnation of this code. It is slightly complicated because in
90  * this case the output can block, so the main delta loop needs to stop
91  * when this happens.
92  *
93  * In pysync a 'last' attribute is used to hold the last miss or match for
94  * extending if possible. In this code, basis_len and scoop_pos are used
95  * instead of 'last'. When basis_len > 0, last is a match. When basis_len =
96  * 0 and scoop_pos is > 0, last is a miss. When both are 0, last is None
97  * (ie, nothing).
98  *
99  * Pysync is also slightly different in that a 'flush' method is available
100  * to force output of accumulated data. This 'flush' is use to finalise
101  * delta calculation. In librsync input is terminated with an eof flag on
102  * the input stream. I have structured this code similar to pysync with a
103  * seperate flush function that is used when eof is reached. This allows
104  * for a flush style API if one is ever needed. Note that flush in pysync
105  * can be used for more than just terminating delta calculation, so a flush
106  * based API can in some ways be more flexible...
107  *
108  * The input data is first scanned, then processed. Scanning identifies
109  * input data as misses or matches, and emits the instruction stream.
110  * Processing the data consumes it off the input scoop and outputs the
111  * processed miss data into the tube.
112  *
113  * The scoop contains all data yet to be processed. The scoop_pos is an
114  * index into the scoop that indicates the point scanned to. As data is
115  * scanned, scoop_pos is incremented. As data is processed, it is removed
116  * from the scoop and scoop_pos adjusted. Everything gets complicated
117  * because the tube can block. When the tube is blocked, no data can be
118  * processed.
119  *
120  */
121 
122 /* used by rdiff, but now redundant */
123 int rs_roll_paranoia = 0;
124 
125 static rs_result rs_delta_s_scan(rs_job_t *job);
126 static rs_result rs_delta_s_flush(rs_job_t *job);
127 static rs_result rs_delta_s_end(rs_job_t *job);
128 void rs_getinput(rs_job_t *job);
129 static inline int rs_findmatch(rs_job_t *job, rs_long_t *match_pos, size_t *match_len);
130 static inline rs_result rs_appendmatch(rs_job_t *job, rs_long_t match_pos, size_t match_len);
131 static inline rs_result rs_appendmiss(rs_job_t *job, size_t miss_len);
132 static inline rs_result rs_appendflush(rs_job_t *job);
133 static inline rs_result rs_processmatch(rs_job_t *job);
134 static inline rs_result rs_processmiss(rs_job_t *job);
135 
136 /**
137  * \brief Get a block of data if possible, and see if it matches.
138  *
139  * On each call, we try to process all of the input data available on the
140  * scoop and input buffer. */
141 static rs_result rs_delta_s_scan(rs_job_t *job)
142 {
143  rs_long_t match_pos;
144  size_t match_len;
145  rs_result result;
146  Rollsum test;
147 
148  rs_job_check(job);
149  /* read the input into the scoop */
150  rs_getinput(job);
151  /* output any pending output from the tube */
152  result=rs_tube_catchup(job);
153  /* while output is not blocked and there is a block of data */
154  while ((result==RS_DONE) &&
155  ((job->scoop_pos + job->block_len) < job->scoop_avail)) {
156  /* check if this block matches */
157  if (rs_findmatch(job,&match_pos,&match_len)) {
158  /* append the match and reset the weak_sum */
159  result=rs_appendmatch(job,match_pos,match_len);
160  RollsumInit(&job->weak_sum);
161  } else {
162  /* rotate the weak_sum and append the miss byte */
163  RollsumRotate(&job->weak_sum,job->scoop_next[job->scoop_pos],
164  job->scoop_next[job->scoop_pos+job->block_len]);
165  result=rs_appendmiss(job,1);
166  if (rs_roll_paranoia) {
167  RollsumInit(&test);
168  RollsumUpdate(&test, job->scoop_next+job->scoop_pos,
169  job->block_len);
170  if (RollsumDigest(&test) != RollsumDigest(&job->weak_sum)) {
171  rs_fatal("mismatch between rolled sum %#x and check %#x",
172  (int)RollsumDigest(&job->weak_sum),
173  (int)RollsumDigest(&test));
174  }
175 
176  }
177  }
178  }
179  /* if we completed OK */
180  if (result==RS_DONE) {
181  /* if we reached eof, we can flush the last fragment */
182  if (job->stream->eof_in) {
183  job->statefn=rs_delta_s_flush;
184  return RS_RUNNING;
185  } else {
186  /* we are blocked waiting for more data */
187  return RS_BLOCKED;
188  }
189  }
190  return result;
191 }
192 
193 
194 static rs_result rs_delta_s_flush(rs_job_t *job)
195 {
196  rs_long_t match_pos;
197  size_t match_len;
198  rs_result result;
199 
200  rs_job_check(job);
201  /* read the input into the scoop */
202  rs_getinput(job);
203  /* output any pending output */
204  result=rs_tube_catchup(job);
205  /* while output is not blocked and there is any remaining data */
206  while ((result==RS_DONE) && (job->scoop_pos < job->scoop_avail)) {
207  /* check if this block matches */
208  if (rs_findmatch(job,&match_pos,&match_len)) {
209  /* append the match and reset the weak_sum */
210  result=rs_appendmatch(job,match_pos,match_len);
211  RollsumInit(&job->weak_sum);
212  } else {
213  /* rollout from weak_sum and append the miss byte */
214  RollsumRollout(&job->weak_sum,job->scoop_next[job->scoop_pos]);
215  rs_trace("block reduced to %d", (int)job->weak_sum.count);
216  result=rs_appendmiss(job,1);
217  }
218  }
219  /* if we are not blocked, flush and set end statefn. */
220  if (result==RS_DONE) {
221  result=rs_appendflush(job);
222  job->statefn=rs_delta_s_end;
223  }
224  if (result==RS_DONE) {
225  return RS_RUNNING;
226  }
227  return result;
228 }
229 
230 
231 static rs_result rs_delta_s_end(rs_job_t *job)
232 {
233  rs_emit_end_cmd(job);
234  return RS_DONE;
235 }
236 
237 
238 void rs_getinput(rs_job_t *job) {
239  size_t len;
240 
241  len=rs_scoop_total_avail(job);
242  if (job->scoop_avail < len) {
243  rs_scoop_input(job,len);
244  }
245 }
246 
247 
248 /**
249  * find a match at scoop_pos, returning the match_pos and match_len.
250  * Note that this will calculate weak_sum if required. It will also
251  * determine the match_len.
252  *
253  * Note that this routine could be modified to do xdelta style matches that
254  * would extend matches past block boundaries by matching backwards and
255  * forwards beyond the block boundaries. Extending backwards would require
256  * decrementing scoop_pos as appropriate.
257  */
258 inline int rs_findmatch(rs_job_t *job, rs_long_t *match_pos, size_t *match_len) {
259  /* calculate the weak_sum if we don't have one */
260  if (job->weak_sum.count == 0) {
261  /* set match_len to min(block_len, scan_avail) */
262  *match_len=job->scoop_avail - job->scoop_pos;
263  if (*match_len > job->block_len) {
264  *match_len = job->block_len;
265  }
266  /* Update the weak_sum */
267  RollsumUpdate(&job->weak_sum,job->scoop_next+job->scoop_pos,*match_len);
268  rs_trace("calculate weak sum from scratch length %d",(int)job->weak_sum.count);
269  } else {
270  /* set the match_len to the weak_sum count */
271  *match_len=job->weak_sum.count;
272  }
273  return rs_search_for_block(RollsumDigest(&job->weak_sum),
274  job->scoop_next+job->scoop_pos,
275  *match_len,
276  job->signature,
277  &job->stats,
278  match_pos);
279 }
280 
281 
282 /**
283  * Append a match at match_pos of length match_len to the delta, extending
284  * a previous match if possible, or flushing any previous miss/match. */
285 inline rs_result rs_appendmatch(rs_job_t *job, rs_long_t match_pos, size_t match_len)
286 {
287  rs_result result=RS_DONE;
288 
289  /* if last was a match that can be extended, extend it */
290  if (job->basis_len && (job->basis_pos + job->basis_len) == match_pos) {
291  job->basis_len+=match_len;
292  } else {
293  /* else appendflush the last value */
294  result=rs_appendflush(job);
295  /* make this the new match value */
296  job->basis_pos=match_pos;
297  job->basis_len=match_len;
298  }
299  /* increment scoop_pos to point at next unscanned data */
300  job->scoop_pos+=match_len;
301  /* we can only process from the scoop if output is not blocked */
302  if (result==RS_DONE) {
303  /* process the match data off the scoop*/
304  result=rs_processmatch(job);
305  }
306  return result;
307 }
308 
309 
310 /**
311  * Append a miss of length miss_len to the delta, extending a previous miss
312  * if possible, or flushing any previous match.
313  *
314  * This also breaks misses up into block_len segments to avoid accumulating
315  * too much in memory. */
316 inline rs_result rs_appendmiss(rs_job_t *job, size_t miss_len)
317 {
318  rs_result result=RS_DONE;
319 
320  /* if last was a match, or block_len misses, appendflush it */
321  if (job->basis_len || (job->scoop_pos >= rs_outbuflen)) {
322  result=rs_appendflush(job);
323  }
324  /* increment scoop_pos */
325  job->scoop_pos+=miss_len;
326  return result;
327 }
328 
329 
330 /**
331  * Flush any accumulating hit or miss, appending it to the delta.
332  */
333 inline rs_result rs_appendflush(rs_job_t *job)
334 {
335  /* if last is a match, emit it and reset last by resetting basis_len */
336  if (job->basis_len) {
337  rs_trace("matched " PRINTF_FORMAT_U64 " bytes at " PRINTF_FORMAT_U64 "!",
338  PRINTF_CAST_U64(job->basis_len),
339  PRINTF_CAST_U64(job->basis_pos));
340  rs_emit_copy_cmd(job, job->basis_pos, job->basis_len);
341  job->basis_len=0;
342  return rs_processmatch(job);
343  /* else if last is a miss, emit and process it*/
344  } else if (job->scoop_pos) {
345  rs_trace("got %ld bytes of literal data", (long) job->scoop_pos);
346  rs_emit_literal_cmd(job, job->scoop_pos);
347  return rs_processmiss(job);
348  }
349  /* otherwise, nothing to flush so we are done */
350  return RS_DONE;
351 }
352 
353 
354 /**
355  * The scoop contains match data at scoop_next of length scoop_pos. This
356  * function processes that match data, returning RS_DONE if it completes,
357  * or RS_BLOCKED if it gets blocked. After it completes scoop_pos is reset
358  * to still point at the next unscanned data.
359  *
360  * This function currently just removes data from the scoop and adjusts
361  * scoop_pos appropriately. In the future this could be used for something
362  * like context compressing of miss data. Note that it also calls
363  * rs_tube_catchup to output any pending output. */
364 inline rs_result rs_processmatch(rs_job_t *job)
365 {
366  job->scoop_avail-=job->scoop_pos;
367  job->scoop_next+=job->scoop_pos;
368  job->scoop_pos=0;
369  return rs_tube_catchup(job);
370 }
371 
372 /**
373  * The scoop contains miss data at scoop_next of length scoop_pos. This
374  * function processes that miss data, returning RS_DONE if it completes, or
375  * RS_BLOCKED if it gets blocked. After it completes scoop_pos is reset to
376  * still point at the next unscanned data.
377  *
378  * This function uses rs_tube_copy to queue copying from the scoop into
379  * output. and uses rs_tube_catchup to do the copying. This automaticly
380  * removes data from the scoop, but this can block. While rs_tube_catchup
381  * is blocked, scoop_pos does not point at legit data, so scanning can also
382  * not proceed.
383  *
384  * In the future this could do compression of miss data before outputing
385  * it. */
386 inline rs_result rs_processmiss(rs_job_t *job)
387 {
388  rs_tube_copy(job, job->scoop_pos);
389  job->scoop_pos=0;
390  return rs_tube_catchup(job);
391 }
392 
393 
394 /**
395  * \brief State function that does a slack delta containing only
396  * literal data to recreate the input.
397  */
398 static rs_result rs_delta_s_slack(rs_job_t *job)
399 {
400  rs_buffers_t * const stream = job->stream;
401  size_t avail = stream->avail_in;
402 
403  if (avail) {
404  rs_trace("emit slack delta for " PRINTF_FORMAT_U64
405  " available bytes", PRINTF_CAST_U64(avail));
406  rs_emit_literal_cmd(job, avail);
407  rs_tube_copy(job, avail);
408  return RS_RUNNING;
409  } else {
410  if (rs_job_input_is_ending(job)) {
411  job->statefn = rs_delta_s_end;
412  return RS_RUNNING;
413  } else {
414  return RS_BLOCKED;
415  }
416  }
417 }
418 
419 
420 /**
421  * State function for writing out the header of the encoding job.
422  */
423 static rs_result rs_delta_s_header(rs_job_t *job)
424 {
425  rs_emit_delta_header(job);
426 
427  if (job->block_len) {
428  if (!job->signature) {
429  rs_error("no signature is loaded into the job");
430  return RS_PARAM_ERROR;
431  }
432  job->statefn = rs_delta_s_scan;
433  } else {
434  rs_trace("block length is zero for this delta; "
435  "therefore using slack deltas");
436  job->statefn = rs_delta_s_slack;
437  }
438 
439  return RS_RUNNING;
440 }
441 
442 
444 {
445  /* Caller must have called rs_build_hash_table() by now */
446  if (!sig->tag_table)
447  rs_fatal("Must call rs_build_hash_table() prior to calling rs_delta_begin()");
448 
449  rs_job_t *job;
450 
451  job = rs_job_new("delta", rs_delta_s_header);
452  job->signature = sig;
453 
454  RollsumInit(&job->weak_sum);
455 
456  if ((job->block_len = sig->block_len) < 0) {
457  rs_log(RS_LOG_ERR, "unreasonable block_len %d in signature",
458  job->block_len);
459  return NULL;
460  }
461 
462  job->strong_sum_len = sig->strong_sum_len;
463  if (job->strong_sum_len < 0 || job->strong_sum_len > RS_MAX_STRONG_SUM_LENGTH) {
464  rs_log(RS_LOG_ERR, "unreasonable strong_sum_len %d in signature",
465  job->strong_sum_len);
466  return NULL;
467  }
468 
469  switch (job->signature->magic) {
470  case RS_BLAKE2_SIG_MAGIC:
471  case RS_MD4_SIG_MAGIC:
472  break;
473  default:
474  rs_error("Unknown signature algorithm %#x", job->signature->magic);
475  return NULL;
476  }
477 
478  return job;
479 }
Description of input and output buffers.
Definition: librsync.h:360
A signature file using the BLAKE2 hash.
Definition: librsync.h:106
Bad value passed in to library, probably an application bug.
Definition: librsync.h:216
rs_signature_t * signature
Signature that&#39;s either being read in, or used for generating a delta.
Definition: job.h:51
long long rs_long_t
A long integer type that can handle the largest file offsets.
size_t avail_in
Number of bytes available at next_in References the length of available input.
Definition: librsync.h:375
rs_long_t basis_pos
Copy from the basis position.
Definition: job.h:92
A signature file with MD4 signatures.
Definition: librsync.h:97
rs_result(* statefn)(rs_job_t *)
Callback for each processing step.
Definition: job.h:38
rs_stats_t stats
Encoding statistics.
Definition: job.h:69
Error conditions.
Definition: librsync.h:122
Public header for librsync.
Rollsum weak_sum
The rollsum weak signature accumulator used by delta.c.
Definition: job.h:60
rs_result
Return codes from nonblocking rsync operations.
Definition: librsync.h:192
rs_job_t * rs_delta_begin(rs_signature_t *)
Prepare to compute a streaming delta.
Definition: delta.c:443
Blocked waiting for more data.
Definition: librsync.h:194
The job is still running, and not yet finished or blocked.
Definition: librsync.h:198
int eof_in
True if there is no more data after this.
Definition: librsync.h:380
Completed successfully.
Definition: librsync.h:193
The contents of this structure are private.
Definition: job.h:29