for_merge.c

Go to the documentation of this file.
00001 //! Program for merging some sorted files with large sizes.
00002 /*! Use a Forecasting merge algorithm, in the style
00003    suggested by Knuth volume 3 (2nd edition), exercise 5.4.6F.  Use
00004    the optimization suggested by exercise 5.4.9.
00005 
00006    This method keeps track of the buffer that will be emptied first
00007    and uses an extra buffer to read the appropriate next part from the disk,
00008    while the remaining buffers continue to be processed.
00009 */
00010 
00011 #define _FILE_OFFSET_BITS 64
00012 #undef ENABLE_NLS
00013 
00014 #include <limits.h>
00015 #include <stddef.h>
00016 #include <stdbool.h>
00017 #include <stdlib.h>
00018 #include <stdio.h>
00019 #include <signal.h>
00020 #include <error.h>
00021 #include <errno.h>
00022 #include <string.h>
00023 #include <stdint.h>
00024 #include <fcntl.h>
00025 #include <pthread.h>
00026 #include <getopt.h>
00027 #include <unistd.h>
00028 #include <sys/wait.h>
00029 #include <sys/param.h>
00030 #include <sys/types.h>
00031 #include <sys/stat.h>
00032 
00033 #if HAVE_SYS_RESOURCE_H
00034 # include <sys/resource.h>
00035 #endif
00036 
/*! Fallback used when no resource-limit API is visible (RLIMIT_DATA is not
   defined): declare a minimal `struct rlimit' and turn every getrlimit()
   call into the constant -1, so callers take their "limit unknown" path.  */
#if !defined RLIMIT_DATA
struct rlimit
{
    size_t rlim_cur;            /*!< Current limit; the only member this file reads. */
};
# define getrlimit(Resource, Rlp) (-1)
#endif
00044 
/*! Name of this program as shown in the usage text. */
#define program_name "for_merge"

/*! Exit statuses.  */
enum
{
    SORT_OUT_OF_ORDER = 1,      /*!< POSIX: status 1 when invoked with -c and the input is not properly sorted. */
    MERGE_FAILURE = 2           /*!< POSIX: any other irregular exit must use a status greater than 1. */
};

/*! Evaluate to 1 when X is non-zero and to 0 otherwise. */
#define NONZERO(x) ((x) != 0)

/*! Byte that terminates a line; '\n' unless changed by the -z option. */
static char eolchar = '\n';
00061 
/*! One input line, kept in memory as a counted (not NUL-terminated) string. */
struct line
{
    char *text;                 /*!< First byte of the line. */
    size_t length;              /*!< Byte count, including the terminating end-of-line byte. */
};
00068 
//! Input buffer for one file.
/*! Layout of the allocation: raw input grows upward from BUF, the table of
 *  `struct line' entries grows downward from the end of the allocation.
 *
 *  | | | | | | | | | | | | | | | | | | ||||||||||||||
 *  |                                   |            |
 *  |      buffer's data                | lines data |
 *  |                                   |            |
 *  |0 1 2 3 4 ...                      |   ... 43210|
 *  ^                                   ^        ^   ^
 *  |                                   |        |   |
 *  buf                                base     cur  linelim
 */
struct buffer
{
    char *buf;                  /*!< Dynamically allocated storage, split into
                                   three regions: input data, an unused gap,
                                   and the line array (in reverse order).  */
    size_t used;                /*!< Bytes of input data currently stored.  */
    size_t nlines;              /*!< Entries currently in the line array.  */
    size_t alloc;               /*!< Total bytes allocated for BUF. */
    size_t left;                /*!< Trailing bytes not yet part of a complete line. */
    size_t line_bytes;          /*!< Bytes reserved in the line array per line. */
    bool eof;                   /*!< Set once end of file has been read.  */
};
00093 
/*! Larger of A and B.  Each argument may be evaluated twice. */
#if !defined MAX
# define MAX(a, b) ((a) > (b) ? (a) : (b))
#endif

/*! Smaller of A and B.  Each argument may be evaluated twice. */
#if !defined MIN
# define MIN(a,b) (((a) < (b)) ? (a) : (b))
#endif
00101 
00102 /*! Minimum size for a buffer.  */
00103 #define MIN_MERGE_BUFFER_SIZE (2 + sizeof (struct line))
00104 
00105 /*! Minimum size for a merge.  */
00106 #define MIN_MERGE_SIZE (16 * MIN_MERGE_BUFFER_SIZE)
00107 
00108 /*! The number of bytes needed for a merge.  */
00109 static size_t merge_buffer_size = MAX (MIN_MERGE_BUFFER_SIZE, 256 * 1024);
00110 
00111 /*! The number of bytes needed for output buffer.  */
00112 static size_t out_buffer_size = MAX (MIN_MERGE_BUFFER_SIZE, 256 * 1024);
00113 
00114 /*! Some systems do not define EXIT_*, despite otherwise supporting C89.  */
00115 # define EXIT_SUCCESS 0
00116 # define EXIT_FAILURE 1
00117 
/*! Completion side of the hand-off: the reader thread sets this flag (under
   the mutex, signalling the condition variable) when a fill request is done. */
bool flag_fcst_buffer_ready;
pthread_cond_t flag_buffer_ready_cv;
pthread_mutex_t flag_buffer_ready_mutex;

/*! Request side of the hand-off: setting this flag (under the mutex,
   signalling the condition variable) wakes the reader thread, which then
   picks up the request parameters stored in the fillbuf_* variables. */
bool flag_fcst_buffer_start;
pthread_cond_t flag_fillbuf_start_cv;
pthread_mutex_t flag_fillbuf_start_mutex;
struct buffer *fillbuf_buf;     /*!< Buffer to be filled. */
struct buffer *fillbuf_buf2;    /*!< Buffer whose unprocessed tail is carried over. */
FILE *fillbuf_fp;               /*!< Stream to read from. */
char const *fillbuf_file;       /*!< Name of that stream, for diagnostics. */
bool fillbuf_ret;               /*!< Result of the last fill: true if lines were read. */
00132 
00133 /*! Report MESSAGE for FILE, then clean up and exit. If FILE is null, it represents standard output.  */
00134 static void
00135 die (char const *message, char const *file)
00136 {
00137     error (MERGE_FAILURE, errno, "%s: %s", message, file ? file : ("standard output"));
00138 }
00139 
00140 /*! Allocate N bytes of memory dynamically, with error checking.  */
00141 void *
00142 xmalloc (size_t n, size_t s)
00143 {
00144     size_t ns = n * s;
00145     void *p = malloc (ns);
00146     if (!p && ns != 0)
00147         error (MERGE_FAILURE, 0, "memory exhausted");
00148     return p;
00149 }
00150 
00151 void
00152 usage (int status)
00153 {
00154     if (status != EXIT_SUCCESS)
00155         fprintf (stderr, "Try `%s --help' for more information.\n", program_name);
00156     else
00157     {
00158         printf (("\
00159 Usage: %s [OPTION]... [FILE1]...\n\
00160 "), program_name);
00161         fputs (("\
00162 Write concatenation of sorted FILEs to file.\n\
00163 \n\
00164 "), stdout);
00165         fputs (("\
00166 Mandatory arguments to long options are mandatory for short options too.\n\
00167 "), stdout);
00168         fputs (("\
00169 Options:\n\
00170 \n\
00171 "), stdout);
00172         fputs (("\
00173   -o, --output=OFILE        MANDATORY - write result to OFILE\n\
00174   -s, --buffer-size=SIZE    use SIZE bytes for input buffer (2*SIZE for each input file)\n\
00175   -S, --output-size=SIZE    use SIZE bytes for output buffer\n\
00176   -z, --zero-terminated     end lines with 0 byte, not newline\n\
00177   -h, --help                print this help\n\
00178 "), stdout);
00179         fputs (("\
00180 \n\
00181 \n\
00182 *** WARNING ***\n\
00183 The locale specified by the environment doesn't affect sort order.\n\
00184 LC_ALL=C is set to get the traditional sort order that uses\n\
00185 native byte values.\n\
00186 *** NOTE ***\n\
00187 Forecasting merge algorithm (P-way merge) \n\
00188 is required 2*P input buffers, where P - number of sorted input files.\n\
00189 "), stdout);
00190     }
00191 
00192     exit (status);
00193 }
00194 
/*! Short option letters accepted by getopt_long(); a trailing ':' marks an
   option that takes an argument.  */
static char const short_options[] = "o:s:S:zh";

/*! Long spellings of the options; each maps to its short option letter.  */
static struct option const long_options[] = {
    {.name = "output",          .has_arg = required_argument, .flag = NULL, .val = 'o'},
    {.name = "buffer-size",     .has_arg = required_argument, .flag = NULL, .val = 's'},
    {.name = "output-size",     .has_arg = required_argument, .flag = NULL, .val = 'S'},
    {.name = "zero-terminated", .has_arg = no_argument,       .flag = NULL, .val = 'z'},
    {.name = "help",            .has_arg = no_argument,       .flag = NULL, .val = 'h'},
    {.name = NULL,              .has_arg = 0,                 .flag = NULL, .val = 0},
};
00205 
/*! Description of one input file taking part in the merge. */
struct mergefile
{
    char const *name;           /*!< Path passed to fopen() and used in diagnostics. */
};
00210 
/*! Write N_BYTES bytes starting at BUF to FP.  A short write is fatal and is
   reported against OUTPUT_FILE through die().  */
static void
write_bytes (const char *buf, size_t n_bytes, FILE * fp, const char *output_file)
{
    size_t written = fwrite (buf, 1, n_bytes, fp);

    if (written == n_bytes)
        return;

    die (("write failed"), output_file);
}
00217 
/*! Return the total amount of physical memory in bytes, or 0 when it cannot
   be determined (sysconf names unavailable or a query failed).  */
double
physmem_total (void)
{
#if defined _SC_PHYS_PAGES && defined _SC_PAGESIZE
    {                           /* This works on linux-gnu, solaris2 and cygwin.  */
        double pages = sysconf (_SC_PHYS_PAGES);
        double pagesize = sysconf (_SC_PAGESIZE);
        /* sysconf reports failure as -1; only use non-negative answers.  */
        if (0 <= pages && 0 <= pagesize)
            return pages * pagesize;
    }
#endif
    return 0;
}
00232 
00233 
/*! Return the amount of physical memory currently available in bytes, or 0
   when it cannot be determined (sysconf names unavailable or a query failed).  */
double
physmem_available (void)
{
#if defined _SC_AVPHYS_PAGES && defined _SC_PAGESIZE
    {                           /* This works on linux-gnu, solaris2 and cygwin.  */
        double pages = sysconf (_SC_AVPHYS_PAGES);
        double pagesize = sysconf (_SC_PAGESIZE);
        /* sysconf reports failure as -1; only use non-negative answers.  */
        if (0 <= pages && 0 <= pagesize)
            return pages * pagesize;
    }
#endif
    return 0;
}
00249 
00250 /*! Return size of the available memory.  */
00251 static size_t
00252 get_memory_available (void)
00253 {
00254 
00255     /* Let MEM be available memory or 1/8 of total memory, whichever
00256        is greater.  */
00257     double avail = physmem_available ();
00258     double total = physmem_total ();
00259     double mem = MAX (avail, total / 8);
00260     struct rlimit rlimit;
00261 
00262     /* Let SIZE be MEM, but no more than the maximum object size or
00263        system resource limits.  Avoid the MIN macro here, as it is not
00264        quite right when only one argument is floating point.  Don't
00265        bother to check for values like RLIM_INFINITY since in practice
00266        they are not much less than SIZE_MAX.  */
00267     size_t size = SIZE_MAX;
00268     if (mem < size)
00269         size = mem;
00270     if (getrlimit (RLIMIT_DATA, &rlimit) == 0 && rlimit.rlim_cur < size)
00271         size = rlimit.rlim_cur;
00272 #ifdef RLIMIT_AS
00273     if (getrlimit (RLIMIT_AS, &rlimit) == 0 && rlimit.rlim_cur < size)
00274         size = rlimit.rlim_cur;
00275 #endif
00276 
00277     /* Leave a large safety margin for the above limits, as failure can occur when they are exceeded.  */
00278     size /= 2;
00279 
00280 #ifdef RLIMIT_RSS
00281     /* Leave a 1/16 margin for RSS to leave room for code, stack, etc.
00282        Exceeding RSS is not fatal, but can be quite slow.  */
00283     if (getrlimit (RLIMIT_RSS, &rlimit) == 0 && rlimit.rlim_cur / 16 * 15 < size)
00284         size = rlimit.rlim_cur / 16 * 15;
00285 #endif
00286 
00287     /* Use no less than the minimum.  */
00288     return MAX (size, MIN_MERGE_SIZE);
00289 }
00290 
00291 /*! Initialize BUF.  Reserve LINE_BYTES bytes for each line; LINE_BYTES
00292    must be at least sizeof (struct line).  Allocate ALLOC bytes initially. */
00293 
00294 static void
00295 initbuf (struct buffer *buf, size_t line_bytes, size_t alloc)
00296 {
00297     /* Ensure that the line array is properly aligned. */
00298     alloc += sizeof (struct line) - alloc % sizeof (struct line);
00299     buf->buf = malloc (alloc);
00300     if (!buf->buf)
00301         error (MERGE_FAILURE, 0, "-memory exhausted");
00302 
00303     buf->line_bytes = line_bytes;
00304     buf->alloc = alloc;
00305     buf->used = buf->left = buf->nlines = 0;
00306     buf->eof = false;
00307 }
00308 
00309 /*! Return one past the limit of the line array.  */
00310 
00311 static inline struct line *
00312 buffer_linelim (struct buffer const *buf)
00313 {
00314     return (struct line *) (buf->buf + buf->alloc);
00315 }
00316 
00317 
00318 /*! Fill forecast buffer in background. */
00319 void *
00320 fill_input_buffer_thread (void *args)
00321 {
00322     char eol = eolchar;
00323 
00324     while (1)
00325     {
00326 
00327         pthread_mutex_lock (&flag_fillbuf_start_mutex);
00328         while (!flag_fcst_buffer_start)
00329             pthread_cond_wait (&flag_fillbuf_start_cv, &flag_fillbuf_start_mutex);
00330 
00331         struct buffer *buf = fillbuf_buf;
00332         struct buffer *buf2 = fillbuf_buf2;
00333         FILE *fp = fillbuf_fp;
00334         char const *file = fillbuf_file;
00335         flag_fcst_buffer_start = false;
00336         fillbuf_ret = false;
00337         pthread_mutex_unlock(&flag_fillbuf_start_mutex);
00338 
00339         size_t line_bytes = buf->line_bytes;
00340 
00341         if (buf->eof)
00342             goto fillbuf_ret_false;
00343 
00344         //  check unprocessed symbols
00345         if (buf2->used != buf2->left)
00346         {
00347             memmove (buf->buf, buf2->buf + buf2->used - buf2->left, buf2->left);
00348             buf->used = buf2->left;
00349             buf->nlines = 0;
00350             buf->left = buf2->left;
00351         }
00352 
00353         for (;;)
00354         {
00355             char *ptr = buf->buf + buf->used;
00356             struct line *linelim = buffer_linelim (buf);
00357             struct line *line = linelim - buf->nlines;
00358             size_t avail = (char *) linelim - buf->nlines * line_bytes - ptr;
00359             char *line_start = buf->nlines ? line->text + line->length : buf->buf;
00360 
00361             while (line_bytes + 1 < avail)
00362             {
00363                 /* Read as many bytes as possible, but do not read so many
00364                    bytes that there might not be enough room for the
00365                    corresponding line array. */
00366                 size_t readsize = (avail - 1) / (line_bytes + 1);
00367 
00368                 size_t bytes_read = fread (ptr, 1, readsize, fp);
00369                 char *ptrlim = ptr + bytes_read;
00370                 char *p;
00371                 avail -= bytes_read;
00372 
00373                 if (bytes_read != readsize)
00374                 {
00375                     if (ferror (fp))
00376                         die (("read failed"), file);
00377                     if (feof (fp))
00378                     {
00379                         buf->eof = true;
00380                         if (buf->buf == ptrlim)
00381                             goto fillbuf_ret_false;
00382                         if (ptrlim[-1] != eol)
00383                             *ptrlim++ = eol;
00384                     }
00385                 }
00386 
00387                 /* Find and record each line in the just-read input.  */
00388                 while ((p = memchr (ptr, eol, ptrlim - ptr)))
00389                 {
00390                     ptr = p + 1;
00391                     line--;
00392                     line->text = line_start;
00393                     line->length = ptr - line_start;
00394                     avail -= line_bytes;
00395 
00396                     line_start = ptr;
00397                 }
00398 
00399                 ptr = ptrlim;
00400                 if (buf->eof)
00401                     break;
00402             }
00403 
00404             buf->used = ptr - buf->buf;
00405             buf->nlines = buffer_linelim (buf) - line;
00406             if (buf->nlines != 0)
00407             {
00408                 buf->left = ptr - line_start;
00409                 goto fillbuf_ret_true;
00410             }
00411         }
00412 
00413       fillbuf_ret_true:
00414 
00415         fillbuf_ret = true;
00416 
00417       fillbuf_ret_false:
00418 
00419         pthread_mutex_lock (&flag_buffer_ready_mutex);
00420         flag_fcst_buffer_ready = true;
00421         pthread_cond_signal (&flag_buffer_ready_cv);
00422         pthread_mutex_unlock (&flag_buffer_ready_mutex);
00423 
00424     }                           // while (1)
00425 }
00426 
00427 
00428 /*! Compare two lines A and B, returning negative, zero, or positive
00429    depending on whether A compares less than, equal to, or greater than B. */
00430 
00431 static int
00432 compare (const struct line *a, const struct line *b)
00433 {
00434     int diff;
00435     size_t alen, blen;
00436 
00437     /* If the keys all compare equal (or no keys were specified)
00438        fall through to the default comparison.  */
00439     alen = a->length - 1, blen = b->length - 1;
00440 
00441     if (alen == 0)
00442         diff = -NONZERO (blen);
00443     else if (blen == 0)
00444         diff = 1;
00445     else if (!(diff = memcmp (a->text, b->text, MIN (alen, blen))))
00446         diff = alen < blen ? -1 : alen != blen;
00447 
00448     return diff;
00449 }
00450 
/*! Restore the ordering of ORD after the line belonging to ORD[0] changed.

   ORD[1..NFILES-1] is assumed to be ordered by the lines CUR[ORD[i]]; the
   entry in ORD[0] is moved back until it sits in front of the first entry
   whose line is larger (on equal lines, in front of the first entry with a
   larger index).  The search starts at position 1, so the common case where
   the new line is still the smallest costs a single comparison.

   NFILES must be at least 1.  Declared `static inline': a plain `inline'
   definition provides no external definition in C99 and later, so calls
   that the compiler does not inline would fail to link.  */
static inline void
reconstruct_table (size_t nfiles, struct line const **cur, size_t * ord)
{
    size_t lo = 1;
    size_t hi = nfiles;
    size_t probe = lo;
    size_t ord0 = ord[0];

    /* Binary search for the first position whose line must follow ORD0's.  */
    while (lo < hi)
    {
        int cmp = compare (cur[ord0], cur[ord[probe]]);
        if (cmp < 0 || (cmp == 0 && ord0 < ord[probe]))
            hi = probe;
        else
            lo = probe + 1;
        probe = (lo + hi) / 2;
    }

    /* Slide the smaller entries one slot forward and drop ORD0 behind them.  */
    size_t count_of_smaller_lines = lo - 1;
    memmove (ord, ord + 1, count_of_smaller_lines * sizeof *ord);
    ord[count_of_smaller_lines] = ord0;
}
00478 
00479 /*! Open FILES (there are NFILES of them) and store the resulting array of stream pointers into (*PFPS).  Allocate the array.  Return the
00480    number of successfully opened files, setting errno if this value is less than NFILES. */
00481 
00482 static size_t
00483 open_input_files (struct mergefile *files, size_t nfiles, FILE *** pfps, char *const *f)
00484 {
00485     FILE **fps = *pfps = xmalloc (nfiles, sizeof *fps);
00486     int i;
00487 
00488     /* Open input files with mmap for reading. */
00489     for (i = 0; i < nfiles; i++)
00490     {
00491         fps[i] = fopen (files[i].name, "rm");
00492         if (!fps[i])
00493             break;
00494     }
00495 
00496     return i;
00497 }
00498 
00499 /*! Start filling input buffer in background thread.
00500     Use second buffer BUF2 to check whether it has some unprocessed data.
00501     If it has - copy this data to the begin of the first buffer. */
00502 void
00503 fill_input_buffer (struct buffer *buf, struct buffer *buf2, FILE * fp, char const *file)
00504 {
00505     pthread_mutex_lock (&flag_fillbuf_start_mutex);
00506     fillbuf_buf = buf;
00507     fillbuf_buf2 = buf2;
00508     fillbuf_fp = fp;
00509     fillbuf_file = file;
00510     flag_fcst_buffer_start = true;
00511     pthread_cond_signal (&flag_fillbuf_start_cv);
00512     pthread_mutex_unlock (&flag_fillbuf_start_mutex);
00513 }
00514 
00515 /*! Swap buffers: first one has new data, second has old data. */
00516 bool
00517 swap_buffers (struct buffer *from, struct buffer *to)
00518 {
00519     pthread_mutex_lock (&flag_buffer_ready_mutex);
00520     while (!flag_fcst_buffer_ready)
00521         pthread_cond_wait (&flag_buffer_ready_cv, &flag_buffer_ready_mutex);
00522     flag_fcst_buffer_ready = false;
00523     bool return_code = fillbuf_ret;
00524 
00525     to->line_bytes = from->line_bytes;
00526     to->alloc = from->alloc;
00527     to->used = from->used;
00528     to->left = from->left;
00529     to->nlines = from->nlines;
00530     to->eof = from->eof;
00531     char *tmp = to->buf;
00532     to->buf = from->buf;
00533     from->buf = tmp;
00534 
00535     pthread_mutex_unlock (&flag_buffer_ready_mutex);
00536     
00537     return return_code;
00538 }
00539 
00540 /*! Merge lines from FILES onto OFP.  NFILES is the number of files; FPS is the vector of open stream corresponding to the files.
00541    Close input and output streams before returning. OUTPUT_FILE gives the name of the output file. */
00542 
00543 static void
00544 merge_fps (struct mergefile *files, size_t nfiles, FILE * ofp, char const *output_file, FILE ** fps)
00545 {
00546     struct buffer *buffer = (struct buffer *) xmalloc (nfiles, sizeof *buffer); /*!< Input buffers for each file. */
00547     struct line const **cur = (struct line const **) xmalloc (nfiles, sizeof *cur); /*!< Current line in each line table. */
00548     struct line const **base = (struct line const **) xmalloc (nfiles, sizeof *base); /*!< Base of each line table.  */
00549     size_t *ord = (size_t *) xmalloc (nfiles, sizeof *ord); /*!< Table representing a permutation of fps, such that cur[ord[0]] is the smallest line and will be next output. */
00550     struct buffer *buffer_fcst = (struct buffer *) xmalloc (nfiles, sizeof *buffer); /*!< Buffers for forecasting. */
00551     size_t *ord_base = (size_t *) xmalloc (nfiles, sizeof *ord_base); /*!< Table representing a permutation of buffers' ends, such that buffer_fcst[ord_base[0]] is the buffer with smallest line and will be emptied first (and should be filled first) */
00552 
00553 
00554     size_t i;
00555     size_t j;
00556     size_t t;
00557 
00558     /*! output buffer */
00559     char *out_buf = malloc (out_buffer_size);
00560     if (!out_buf)
00561         error (MERGE_FAILURE, 0, "memory exhausted");
00562     char *out_buf_cur = out_buf;
00563     size_t out_buf_len_cur = out_buffer_size;
00564 
00565     /* Prepare buffers and read initial lines from each input file. */
00566     for (i = 0; i < nfiles;)
00567     {
00568         initbuf (&buffer[i], sizeof (struct line), merge_buffer_size);
00569         initbuf (&buffer_fcst[i], sizeof (struct line), merge_buffer_size);
00570 
00571         fill_input_buffer (&buffer_fcst[i], &buffer[i], fps[i], files[i].name);
00572 
00573         if (swap_buffers (&buffer_fcst[i], &buffer[i]))
00574         {
00575             struct line const *linelim = buffer_linelim (&buffer[i]);
00576             cur[i] = linelim - 1;
00577             base[i] = linelim - buffer[i].nlines;
00578             i++;
00579         }
00580         else
00581         {
00582             // fps[i] is empty; eliminate it from future consideration.  
00583             if (fclose (fps[i]) != 0)
00584                 die (("close failed"), files[i].name);
00585             free (buffer[i].buf);
00586             free (buffer_fcst[i].buf);
00587             --nfiles;
00588             for (j = i; j < nfiles; ++j)
00589             {
00590                 files[j] = files[j + 1];
00591                 fps[j] = fps[j + 1];
00592             }
00593         }
00594 
00595     }
00596 
00597     /* Set up the ord table according to comparisons among input lines.
00598        Since this only reorders two items if one is strictly greater than
00599        the other, it is stable. */
00600     for (i = 0; i < nfiles; ++i) {
00601         ord[i] = i;
00602     }
00603     for (i = 1; i < nfiles; ++i) {
00604         if (0 < compare (cur[ord[i - 1]], cur[ord[i]]))
00605             t = ord[i - 1], ord[i - 1] = ord[i], ord[i] = t, i = 0;
00606     }
00607 
00608     /* Set up the ord_base table according to comparisons among input lines. */
00609     for (i = 0; i < nfiles; ++i)
00610         ord_base[i] = i;
00611     for (i = 1; i < nfiles; ++i)
00612         if (0 < compare (base[ord_base[i - 1]], base[ord_base[i]]))
00613             t = ord_base[i - 1], ord_base[i - 1] = ord_base[i], ord_base[i] = t, i = 0;
00614 
00615     fill_input_buffer (&buffer_fcst[ord_base[0]], &buffer[ord_base[0]], fps[ord_base[0]], files[ord_base[0]].name);
00616 
00617     /* Repeatedly output the smallest line until no input remains. */
00618     while (nfiles)
00619     {
00620         struct line const *smallest = cur[ord[0]];
00621 
00622         if (out_buf_len_cur > smallest->length)
00623         {
00624             memmove (out_buf_cur, smallest->text, smallest->length);
00625             out_buf_len_cur -= smallest->length;
00626             out_buf_cur += smallest->length;
00627         }
00628         else
00629         {
00630             write_bytes (out_buf, out_buffer_size - out_buf_len_cur, ofp, output_file);
00631             out_buf_len_cur = out_buffer_size;
00632             out_buf_cur = out_buf;
00633 
00634             memmove (out_buf_cur, smallest->text, smallest->length);
00635             out_buf_len_cur -= smallest->length;
00636             out_buf_cur += smallest->length;
00637         }
00638 
00639         /* Check if we need to read more lines into core. */
00640         if (base[ord[0]] < smallest)
00641             cur[ord[0]] = smallest - 1;
00642         else
00643         {
00644             if (buffer[ord[0]].eof)
00645             {
00646                 //We reached EOF on fps[ord[0]].  
00647                 for (i = 1; i < nfiles; ++i)
00648                     if (ord[i] > ord[0])
00649                         --ord[i];
00650 
00651                 for (i = 1; i < nfiles; ++i)
00652                     if (ord_base[i] > ord_base[0])
00653                         --ord_base[i];
00654 
00655                 --nfiles;
00656                 if (fclose (fps[ord[0]]) != 0)
00657                     die (("close failed"), files[ord[0]].name);
00658 
00659                 free (buffer[ord[0]].buf);
00660                 for (i = ord[0]; i < nfiles; ++i)
00661                 {
00662                     fps[i] = fps[i + 1];
00663                     files[i] = files[i + 1];
00664                     buffer[i] = buffer[i + 1];
00665                     cur[i] = cur[i + 1];
00666                     base[i] = base[i + 1];
00667                 }
00668                 for (i = 0; i < nfiles; ++i)
00669                     ord[i] = ord[i + 1];
00670 
00671                 free (buffer_fcst[ord_base[0]].buf);
00672                 for (i = ord_base[0]; i < nfiles; ++i)
00673                 {
00674                     buffer_fcst[i] = buffer_fcst[i + 1];
00675                 }
00676                 for (i = 0; i < nfiles; ++i)
00677                     ord_base[i] = ord_base[i + 1];
00678 
00679                 continue;
00680             }
00681             else
00682             {
00683                 /* The end of buffer has just been reached.
00684                    Forecasted buffer has already been filled and ready to use. */
00685                 bool return_code = swap_buffers (&buffer_fcst[ord_base[0]], &buffer[ord[0]]);
00686 
00687                 struct line const *linelim = buffer_linelim (&buffer[ord[0]]);
00688                 cur[ord[0]] = linelim - 1;
00689                 base[ord[0]] = linelim - buffer[ord[0]].nlines;
00690 
00691                 reconstruct_table (nfiles, base, ord_base);
00692 
00693                 if (return_code)
00694                     fill_input_buffer (&buffer_fcst[ord_base[0]], &buffer[ord_base[0]], fps[ord_base[0]], files[ord_base[0]].name);
00695             }
00696         }
00697         reconstruct_table (nfiles, cur, ord);
00698     }                           // while (nfiles)
00699 
00700 
00701     if (out_buffer_size != out_buf_len_cur)
00702     {
00703         write_bytes (out_buf, out_buffer_size - out_buf_len_cur, ofp, output_file);
00704     }
00705 
00706     if (fclose (ofp) != 0)
00707         die (("close failed"), output_file);
00708     free (fps);
00709     free (buffer);
00710     free (buffer_fcst);
00711     free (ord);
00712     free (ord_base);
00713     free (base);
00714     free (cur);
00715     free (out_buf);
00716 }
00717 
00718 
00719 int
00720 main (int argc, char **argv)
00721 {
00722     int c = 0;
00723     size_t nfiles = 0;
00724     char **ifiles;
00725     char const *output_file = NULL;
00726     size_t merge_size = 0;
00727     size_t out_size = 0;
00728     FILE **fps;
00729 
00730     ifiles = (char **) xmalloc (argc, sizeof *ifiles);
00731 
00732     for (;;)
00733     {
00734         int oi = -1;
00735 
00736         if (c == -1 || ((c = getopt_long (argc, argv, short_options, long_options, &oi)) == -1))
00737         {
00738             if (argc <= optind)
00739                 break;
00740             ifiles[nfiles++] = argv[optind++];
00741         }
00742         else
00743             switch (c)
00744             {
00745             case 1:
00746                 ifiles[nfiles++] = optarg;
00747                 break;
00748 
00749             case 'o':
00750                 if (output_file && !strcmp (output_file, optarg))
00751                     error (MERGE_FAILURE, 0, ("multiple output files specified"));
00752                 output_file = optarg;
00753                 break;
00754 
00755             case 's':
00756                 {
00757                     char *endptr;
00758                     merge_size = strtol (optarg, &endptr, 10);
00759                     if ((errno == ERANGE && (merge_size == LONG_MAX || merge_size == LONG_MIN)) || (errno != 0 && merge_size == 0))
00760                     {
00761                         error (MERGE_FAILURE, 0, ("strtol"));
00762                     }
00763                     break;
00764                 }
00765 
00766             case 'S':
00767                 {
00768                     char *endptr;
00769                     out_size = strtol (optarg, &endptr, 10);
00770                     if ((errno == ERANGE && (out_size == LONG_MAX || out_size == LONG_MIN)) || (errno != 0 && out_size == 0))
00771                     {
00772                         error (MERGE_FAILURE, 0, ("strtol"));
00773                     }
00774                     break;
00775                 }
00776 
00777             case 'z':
00778                 eolchar = 0;
00779                 break;
00780 
00781             case 'h':
00782                 usage (EXIT_SUCCESS);
00783                 break;
00784 
00785             default:
00786                 usage (EXIT_SUCCESS);
00787             }
00788     }
00789 
00790     if (nfiles < 2)
00791         error (MERGE_FAILURE, 0, ("More input files are required for merging"));
00792 
00793     setenv ("LC_ALL", "C", 1);
00794 
00795     size_t memory_available = get_memory_available ();
00796     if (merge_size > 0)
00797     {
00798         //  the user has specified a size
00799         merge_size = MAX (merge_size, MIN_MERGE_SIZE);
00800         merge_buffer_size = merge_size;
00801     }
00802     else
00803     {
00804         merge_buffer_size = MAX (merge_buffer_size, memory_available / (2 * nfiles + 2));
00805     }
00806 
00807     if (out_size > 0)
00808     {
00809         //  the user has specified a size
00810         out_size = MAX (out_size, out_buffer_size);
00811         out_buffer_size = out_size;
00812     }
00813     else
00814     {
00815         out_buffer_size = MAX (out_buffer_size, memory_available / (2 * nfiles + 2));
00816     }
00817 
00818     fprintf (stdout, "        merge_size = %ld\n", (long int) merge_size);
00819     fprintf (stdout, "          out_size = %ld\n", (long int) out_size);
00820     fprintf (stdout, "  memory_available = %ld\n", (long int) memory_available);
00821     fprintf (stdout, " merge_buffer_size = %ld\n", (long int) merge_buffer_size);
00822     fprintf (stdout, "   out_buffer_size = %ld\n", (long int) out_buffer_size);
00823 
00824 
00825     flag_fcst_buffer_ready = false;
00826     pthread_cond_init (&flag_buffer_ready_cv, NULL);
00827     pthread_mutex_init (&flag_buffer_ready_mutex, NULL);
00828 
00829     flag_fcst_buffer_start = false;
00830     fillbuf_buf = NULL;
00831     fillbuf_buf2 = NULL;
00832     fillbuf_fp = NULL;
00833     fillbuf_file = NULL;
00834     fillbuf_ret = false;
00835     pthread_cond_init (&flag_fillbuf_start_cv, NULL);
00836     pthread_mutex_init (&flag_fillbuf_start_mutex, NULL);
00837 
00838     pthread_t fillbuf_thread_id;
00839     pthread_create (&fillbuf_thread_id, NULL, &fill_input_buffer_thread, NULL);
00840 
00841 
00842     struct mergefile *files = (struct mergefile *) calloc (nfiles, sizeof *files);
00843     if (!files)
00844         error (MERGE_FAILURE, 0, "memory exhausted");
00845 
00846     size_t i;
00847     for (i = 0; i < nfiles; ++i)
00848         files[i].name = ifiles[i];
00849 
00850     size_t nopened = open_input_files (files, nfiles, &fps, ifiles);
00851 
00852     if (nopened == nfiles)
00853     {
00854         FILE *ofp = fopen (output_file, "w");
00855         if (ofp)
00856         {
00857             merge_fps (files, nfiles, ofp, output_file, fps);
00858         }
00859         else if (errno != EMFILE || nopened <= 2)
00860             die (("open failed"), output_file);
00861     }
00862     else if (nopened <= 2)
00863         die (("open failed"), files[nopened].name);
00864 
00865     pthread_cancel (fillbuf_thread_id);
00866     pthread_join (fillbuf_thread_id, NULL);
00867 
00868     free (files);
00869 
00870     exit (EXIT_SUCCESS);
00871 }

Generated on Mon Aug 24 18:22:18 2009 for for_merge - merging some sorted files with large sizes by  doxygen 1.4.7