for_merge.c File Reference

#include <limits.h>
#include <stddef.h>
#include <stdbool.h>
#include <stdlib.h>
#include <stdio.h>
#include <signal.h>
#include <error.h>
#include <errno.h>
#include <string.h>
#include <stdint.h>
#include <fcntl.h>
#include <pthread.h>
#include <getopt.h>
#include <unistd.h>
#include <sys/wait.h>
#include <sys/param.h>
#include <sys/types.h>
#include <sys/stat.h>

Go to the source code of this file.

Data Structures

struct  rlimit
struct  line
struct  buffer
 Input buffers. More...
struct  mergefile

Defines

#define _FILE_OFFSET_BITS   64
 Program for merging some sorted files with large sizes.
#define getrlimit(Resource, Rlp)   (-1)
#define program_name   "for_merge"
#define NONZERO(x)   ((x) != 0)
#define MAX(a, b)   ((a) > (b) ? (a) : (b))
#define MIN(a, b)   (((a) < (b)) ? (a) : (b))
#define MIN_MERGE_BUFFER_SIZE   (2 + sizeof (struct line))
#define MIN_MERGE_SIZE   (16 * MIN_MERGE_BUFFER_SIZE)
#define EXIT_SUCCESS   0
#define EXIT_FAILURE   1

Enumerations

enum  { SORT_OUT_OF_ORDER = 1, MERGE_FAILURE = 2 }

Functions

static void die (char const *message, char const *file)
void * xmalloc (size_t n, size_t s)
void usage (int status)
static void write_bytes (const char *buf, size_t n_bytes, FILE *fp, const char *output_file)
double physmem_total ()
double physmem_available ()
static size_t get_memory_available (void)
static void initbuf (struct buffer *buf, size_t line_bytes, size_t alloc)
static struct line * buffer_linelim (struct buffer const *buf)
void * fill_input_buffer_thread (void *args)
static int compare (const struct line *a, const struct line *b)
void reconstruct_table (size_t nfiles, struct line const **cur, size_t *ord)
static size_t open_input_files (struct mergefile *files, size_t nfiles, FILE ***pfps, char *const *f)
void fill_input_buffer (struct buffer *buf, struct buffer *buf2, FILE *fp, char const *file)
bool swap_buffers (struct buffer *from, struct buffer *to)
static void merge_fps (struct mergefile *files, size_t nfiles, FILE *ofp, char const *output_file, FILE **fps)
int main (int argc, char **argv)

Variables

static char eolchar = '\n'
static size_t merge_buffer_size = MAX (MIN_MERGE_BUFFER_SIZE, 256 * 1024)
static size_t out_buffer_size = MAX (MIN_MERGE_BUFFER_SIZE, 256 * 1024)
bool flag_fcst_buffer_ready
pthread_cond_t flag_buffer_ready_cv
pthread_mutex_t flag_buffer_ready_mutex
bool flag_fcst_buffer_start
pthread_cond_t flag_fillbuf_start_cv
pthread_mutex_t flag_fillbuf_start_mutex
struct buffer * fillbuf_buf
struct buffer * fillbuf_buf2
FILE * fillbuf_fp
char const * fillbuf_file
bool fillbuf_ret
static char const short_options [] = "o:s:S:zh"
static struct option const long_options []


Define Documentation

#define _FILE_OFFSET_BITS   64

Program for merging some sorted files with large sizes.

Use a Forecasting merge algorithm, in the style suggested by Knuth volume 3 (2nd edition), exercise 5.4.6F. Use the optimization suggested by exercise 5.4.9.

This method keeps track of the buffer that will be emptied first and uses an extra buffer to read the appropriate next part from the disk, while the contents of the remaining buffers continue to be processed.

Definition at line 11 of file for_merge.c.

#define EXIT_FAILURE   1

Definition at line 116 of file for_merge.c.

#define EXIT_SUCCESS   0

Some systems do not define EXIT_*, despite otherwise supporting C89.

Definition at line 115 of file for_merge.c.

Referenced by main(), and usage().

#define getrlimit ( Resource,
Rlp   )     (-1)

Definition at line 42 of file for_merge.c.

Referenced by get_memory_available().

#define MAX ( a,
b   )     ((a) > (b) ? (a) : (b))

Definition at line 95 of file for_merge.c.

Referenced by get_memory_available(), and main().

#define MIN ( a,
b   )     (((a) < (b)) ? (a) : (b))

Definition at line 99 of file for_merge.c.

Referenced by compare().

#define MIN_MERGE_BUFFER_SIZE   (2 + sizeof (struct line))

Minimum size for a buffer.

Definition at line 103 of file for_merge.c.

#define MIN_MERGE_SIZE   (16 * MIN_MERGE_BUFFER_SIZE)

Minimum size for a merge.

Definition at line 106 of file for_merge.c.

Referenced by get_memory_available(), and main().

#define NONZERO ( x  )     ((x) != 0)

Definition at line 57 of file for_merge.c.

Referenced by compare().

#define program_name   "for_merge"

Definition at line 45 of file for_merge.c.

Referenced by usage().


Enumeration Type Documentation

anonymous enum

Exit statuses.

Enumerator:
SORT_OUT_OF_ORDER  POSIX says to exit with status 1 if invoked with -c and the input is not properly sorted.
MERGE_FAILURE  POSIX says any other irregular exit must exit with a status code greater than 1.

Definition at line 48 of file for_merge.c.

00049 {   /* Exit statuses; MERGE_FAILURE is the status this file hands to error() on fatal conditions. */
00050     /*! POSIX says to exit with status 1 if invoked with -c and the input is not properly sorted.  */
00051     SORT_OUT_OF_ORDER = 1,
00052 
00053     /*! POSIX says any other irregular exit must exit with a status code greater than 1.  */
00054     MERGE_FAILURE = 2
00055 };


Function Documentation

static struct line* buffer_linelim ( struct buffer const *  buf  )  [inline, static]

Return one past the limit of the line array.

Definition at line 312 of file for_merge.c.

References buffer::alloc, and buffer::buf.

Referenced by fill_input_buffer_thread(), and merge_fps().

00313 {   /* Line records are written downward from the end of the allocation (see fill_input_buffer_thread), so the limit is simply the end address. */
00314     return (struct line *) (buf->buf + buf->alloc);
00315 }

static int compare ( const struct line *  a,
const struct line *  b 
) [static]

Compare two lines A and B, returning negative, zero, or positive depending on whether A compares less than, equal to, or greater than B.

Definition at line 432 of file for_merge.c.

References line::length, MIN, NONZERO, and line::text.

Referenced by merge_fps(), and reconstruct_table().

00433 {
00434     int diff;
00435     size_t alen, blen;
00436 
00437     /* Compare the raw line bytes.  Each stored length counts the trailing
00438        end-of-line byte, so drop it before comparing.  */
00439     alen = a->length - 1, blen = b->length - 1;
00440 
00441     if (alen == 0)
00442         diff = -NONZERO (blen);     /* empty A: equal to an empty B, otherwise less */
00443     else if (blen == 0)
00444         diff = 1;                   /* non-empty A sorts after an empty B */
00445     else if (!(diff = memcmp (a->text, b->text, MIN (alen, blen))))
00446         diff = alen < blen ? -1 : alen != blen;     /* common prefix is equal: the shorter line sorts first */
00447 
00448     return diff;
00449 }

static void die ( char const *  message,
char const *  file 
) [static]

Report MESSAGE for FILE, then clean up and exit. If FILE is null, it represents standard output.

Definition at line 135 of file for_merge.c.

References MERGE_FAILURE.

Referenced by fill_input_buffer_thread(), main(), merge_fps(), and write_bytes().

00136 {
00137     error (MERGE_FAILURE, errno, "%s: %s", message, file ? file : ("standard output"));  /* reports using the current errno with status MERGE_FAILURE; no separate cleanup step is performed here */
00138 }

void fill_input_buffer ( struct buffer *  buf,
struct buffer *  buf2,
FILE *  fp,
char const *  file 
)

Start filling the input buffer BUF in the background thread. Check whether the second buffer BUF2 still holds unprocessed data; if it does, copy that data to the beginning of the first buffer.

Definition at line 503 of file for_merge.c.

References fillbuf_buf, fillbuf_buf2, fillbuf_file, fillbuf_fp, flag_fcst_buffer_start, flag_fillbuf_start_cv, and flag_fillbuf_start_mutex.

Referenced by merge_fps().

00504 {
00505     pthread_mutex_lock (&flag_fillbuf_start_mutex);
00506     fillbuf_buf = buf;          /* publish the request through the file-scope fillbuf_* variables */
00507     fillbuf_buf2 = buf2;
00508     fillbuf_fp = fp;
00509     fillbuf_file = file;
00510     flag_fcst_buffer_start = true;      /* predicate that fill_input_buffer_thread() waits on */
00511     pthread_cond_signal (&flag_fillbuf_start_cv);
00512     pthread_mutex_unlock (&flag_fillbuf_start_mutex);   /* returns without waiting; swap_buffers() is what waits for the result */
00513 }

void* fill_input_buffer_thread ( void *  args  ) 

Fill forecast buffer in background.

Definition at line 320 of file for_merge.c.

References buffer::buf, buffer_linelim(), die(), buffer::eof, eolchar, fillbuf_buf, fillbuf_buf2, fillbuf_file, fillbuf_fp, fillbuf_ret, flag_buffer_ready_cv, flag_buffer_ready_mutex, flag_fcst_buffer_ready, flag_fcst_buffer_start, flag_fillbuf_start_cv, flag_fillbuf_start_mutex, buffer::left, line::length, buffer::line_bytes, buffer::nlines, line::text, and buffer::used.

Referenced by main().

00321 {
00322     char eol = eolchar;         /* line terminator, sampled once when the thread starts */
00323 
00324     while (1)                   /* serve one fill request per iteration; never returns */
00325     {
00326 
00327         pthread_mutex_lock (&flag_fillbuf_start_mutex);
00328         while (!flag_fcst_buffer_start)         /* sleep until fill_input_buffer() posts a request */
00329             pthread_cond_wait (&flag_fillbuf_start_cv, &flag_fillbuf_start_mutex);
00330 
00331         struct buffer *buf = fillbuf_buf;       /* snapshot the request while still holding the mutex */
00332         struct buffer *buf2 = fillbuf_buf2;
00333         FILE *fp = fillbuf_fp;
00334         char const *file = fillbuf_file;
00335         flag_fcst_buffer_start = false;         /* consume the request */
00336         fillbuf_ret = false;                    /* result stays false unless at least one line is read */
00337         pthread_mutex_unlock(&flag_fillbuf_start_mutex);
00338 
00339         size_t line_bytes = buf->line_bytes;
00340 
00341         if (buf->eof)           /* this buffer already saw end of file: nothing more to read */
00342             goto fillbuf_ret_false;
00343 
00344         //  carry the trailing partial line (the last buf2->left bytes of buf2) over to the start of buf
00345         if (buf2->used != buf2->left)
00346         {
00347             memmove (buf->buf, buf2->buf + buf2->used - buf2->left, buf2->left);
00348             buf->used = buf2->left;
00349             buf->nlines = 0;
00350             buf->left = buf2->left;
00351         }
00352 
00353         for (;;)
00354         {
00355             char *ptr = buf->buf + buf->used;   /* next byte to read into */
00356             struct line *linelim = buffer_linelim (buf);
00357             struct line *line = linelim - buf->nlines;          /* lowest line record written so far */
00358             size_t avail = (char *) linelim - buf->nlines * line_bytes - ptr;   /* free bytes between the text and the line table */
00359             char *line_start = buf->nlines ? line->text + line->length : buf->buf;      /* start of the line currently being assembled */
00360 
00361             while (line_bytes + 1 < avail)
00362             {
00363                 /* Read as many bytes as possible, but do not read so many
00364                    bytes that there might not be enough room for the
00365                    corresponding line array. */
00366                 size_t readsize = (avail - 1) / (line_bytes + 1);
00367 
00368                 size_t bytes_read = fread (ptr, 1, readsize, fp);
00369                 char *ptrlim = ptr + bytes_read;
00370                 char *p;
00371                 avail -= bytes_read;
00372 
00373                 if (bytes_read != readsize)     /* short read: either an error or end of file */
00374                 {
00375                     if (ferror (fp))
00376                         die (("read failed"), file);
00377                     if (feof (fp))
00378                     {
00379                         buf->eof = true;
00380                         if (buf->buf == ptrlim)         /* nothing in the buffer at all */
00381                             goto fillbuf_ret_false;
00382                         if (ptrlim[-1] != eol)          /* terminate an unterminated final line */
00383                             *ptrlim++ = eol;
00384                     }
00385                 }
00386 
00387                 /* Find and record each line in the just-read input.  */
00388                 while ((p = memchr (ptr, eol, ptrlim - ptr)))
00389                 {
00390                     ptr = p + 1;
00391                     line--;                             /* the line table grows toward lower addresses */
00392                     line->text = line_start;
00393                     line->length = ptr - line_start;    /* length includes the terminator byte */
00394                     avail -= line_bytes;
00395 
00396                     line_start = ptr;
00397                 }
00398 
00399                 ptr = ptrlim;
00400                 if (buf->eof)
00401                     break;
00402             }
00403 
00404             buf->used = ptr - buf->buf;
00405             buf->nlines = buffer_linelim (buf) - line;
00406             if (buf->nlines != 0)
00407             {
00408                 buf->left = ptr - line_start;   /* bytes of the trailing, still incomplete line */
00409                 goto fillbuf_ret_true;
00410             }
00411         }       /* NOTE(review): no buffer growth is visible here; if the buffer fills without any terminator this for (;;) looks like it repeats without progress - confirm */
00412 
00413       fillbuf_ret_true:
00414 
00415         fillbuf_ret = true;
00416 
00417       fillbuf_ret_false:
00418 
00419         pthread_mutex_lock (&flag_buffer_ready_mutex);
00420         flag_fcst_buffer_ready = true;          /* predicate that swap_buffers() waits on */
00421         pthread_cond_signal (&flag_buffer_ready_cv);
00422         pthread_mutex_unlock (&flag_buffer_ready_mutex);
00423 
00424     }                           // while (1)
00425 }

static size_t get_memory_available ( void   )  [static]

Return size of the available memory.

Definition at line 252 of file for_merge.c.

References getrlimit, MAX, MIN_MERGE_SIZE, physmem_available(), physmem_total(), and rlimit::rlim_cur.

Referenced by main().

00253 {
00254 
00255     /* Let MEM be available memory or 1/8 of total memory, whichever
00256        is greater.  */
00257     double avail = physmem_available ();
00258     double total = physmem_total ();
00259     double mem = MAX (avail, total / 8);
00260     struct rlimit rlimit;
00261 
00262     /* Let SIZE be MEM, but no more than the maximum object size or
00263        system resource limits.  Avoid the MIN macro here, as it is not
00264        quite right when only one argument is floating point.  Don't
00265        bother to check for values like RLIM_INFINITY since in practice
00266        they are not much less than SIZE_MAX.  */
00267     size_t size = SIZE_MAX;
00268     if (mem < size)
00269         size = mem;             /* double is converted to size_t here */
00270     if (getrlimit (RLIMIT_DATA, &rlimit) == 0 && rlimit.rlim_cur < size)       /* never true while the file's getrlimit fallback macro, which expands to (-1), is in effect */
00271         size = rlimit.rlim_cur;
00272 #ifdef RLIMIT_AS
00273     if (getrlimit (RLIMIT_AS, &rlimit) == 0 && rlimit.rlim_cur < size)
00274         size = rlimit.rlim_cur;
00275 #endif
00276 
00277     /* Leave a large safety margin for the above limits, as failure can occur when they are exceeded.  */
00278     size /= 2;
00279 
00280 #ifdef RLIMIT_RSS
00281     /* Leave a 1/16 margin for RSS to leave room for code, stack, etc.
00282        Exceeding RSS is not fatal, but can be quite slow.  */
00283     if (getrlimit (RLIMIT_RSS, &rlimit) == 0 && rlimit.rlim_cur / 16 * 15 < size)
00284         size = rlimit.rlim_cur / 16 * 15;
00285 #endif
00286 
00287     /* Use no less than the minimum.  */
00288     return MAX (size, MIN_MERGE_SIZE);
00289 }

static void initbuf ( struct buffer *  buf,
size_t  line_bytes,
size_t  alloc 
) [static]

Initialize BUF. Reserve LINE_BYTES bytes for each line; LINE_BYTES must be at least sizeof (struct line). Allocate ALLOC bytes initially.

Definition at line 295 of file for_merge.c.

References buffer::alloc, buffer::buf, buffer::eof, buffer::left, buffer::line_bytes, MERGE_FAILURE, buffer::nlines, and buffer::used.

Referenced by merge_fps().

00296 {
00297     /* Ensure that the line array is properly aligned.  This rounds ALLOC up to the
00298        next multiple of sizeof (struct line); a full element is added when ALLOC is already a multiple. */
00299     buf->buf = malloc (alloc);
00300     if (!buf->buf)
00301         error (MERGE_FAILURE, 0, "-memory exhausted");   /* NOTE(review): leading '-' differs from the "memory exhausted" message used elsewhere in this file - confirm intent */
00302 
00303     buf->line_bytes = line_bytes;
00304     buf->alloc = alloc;
00305     buf->used = buf->left = buf->nlines = 0;    /* buffer starts empty */
00306     buf->eof = false;
00307 }

int main ( int  argc,
char **  argv 
)

Definition at line 720 of file for_merge.c.

References die(), eolchar, EXIT_SUCCESS, fill_input_buffer_thread(), fillbuf_buf, fillbuf_buf2, fillbuf_file, fillbuf_fp, fillbuf_ret, flag_buffer_ready_cv, flag_buffer_ready_mutex, flag_fcst_buffer_ready, flag_fcst_buffer_start, flag_fillbuf_start_cv, flag_fillbuf_start_mutex, get_memory_available(), long_options, MAX, merge_buffer_size, MERGE_FAILURE, merge_fps(), MIN_MERGE_SIZE, mergefile::name, open_input_files(), out_buffer_size, short_options, usage(), and xmalloc().

00721 {
00722     int c = 0;
00723     size_t nfiles = 0;
00724     char **ifiles;
00725     char const *output_file = NULL;
00726     size_t merge_size = 0;
00727     size_t out_size = 0;
00728     FILE **fps;
00729 
00730     ifiles = (char **) xmalloc (argc, sizeof *ifiles);  /* room for argc names, an upper bound on the number of input files */
00731 
00732     for (;;)
00733     {
00734         int oi = -1;
00735 
00736         if (c == -1 || ((c = getopt_long (argc, argv, short_options, long_options, &oi)) == -1))       /* once getopt_long reports -1, c stays -1 and the remaining arguments are taken as input files */
00737         {
00738             if (argc <= optind)
00739                 break;
00740             ifiles[nfiles++] = argv[optind++];
00741         }
00742         else
00743             switch (c)
00744             {
00745             case 1:             /* presumably a non-option argument reported in order by getopt_long - TODO confirm, short_options does not start with '-' */
00746                 ifiles[nfiles++] = optarg;
00747                 break;
00748 
00749             case 'o':
00750                 if (output_file && !strcmp (output_file, optarg))       /* NOTE(review): fires only when the SAME name is repeated; a different second -o silently replaces the first - the message suggests the test may be inverted */
00751                     error (MERGE_FAILURE, 0, ("multiple output files specified"));
00752                 output_file = optarg;
00753                 break;
00754 
00755             case 's':
00756                 {
00757                     char *endptr;       /* set by strtol but not inspected afterwards */
00758                     merge_size = strtol (optarg, &endptr, 10);  /* NOTE(review): errno is not cleared before this call, and the long result is stored in a size_t */
00759                     if ((errno == ERANGE && (merge_size == LONG_MAX || merge_size == LONG_MIN)) || (errno != 0 && merge_size == 0))
00760                     {
00761                         error (MERGE_FAILURE, 0, ("strtol"));
00762                     }
00763                     break;
00764                 }
00765 
00766             case 'S':
00767                 {
00768                     char *endptr;       /* set by strtol but not inspected afterwards */
00769                     out_size = strtol (optarg, &endptr, 10);    /* NOTE(review): same errno / size_t caveats as for -s */
00770                     if ((errno == ERANGE && (out_size == LONG_MAX || out_size == LONG_MIN)) || (errno != 0 && out_size == 0))
00771                     {
00772                         error (MERGE_FAILURE, 0, ("strtol"));
00773                     }
00774                     break;
00775                 }
00776 
00777             case 'z':
00778                 eolchar = 0;    /* lines are terminated by a NUL byte instead of newline */
00779                 break;
00780 
00781             case 'h':
00782                 usage (EXIT_SUCCESS);
00783                 break;
00784 
00785             default:
00786                 usage (EXIT_SUCCESS);   /* NOTE(review): unrecognised options also take the EXIT_SUCCESS path - confirm that is intended */
00787             }
00788     }
00789 
00790     if (nfiles < 2)
00791         error (MERGE_FAILURE, 0, ("More input files are required for merging"));
00792 
00793     setenv ("LC_ALL", "C", 1);
00794 
00795     size_t memory_available = get_memory_available ();
00796     if (merge_size > 0)
00797     {
00798         //  the user has specified a size
00799         merge_size = MAX (merge_size, MIN_MERGE_SIZE);
00800         merge_buffer_size = merge_size;
00801     }
00802     else
00803     {
00804         merge_buffer_size = MAX (merge_buffer_size, memory_available / (2 * nfiles + 2));      /* share memory over two buffers per input file plus two more */
00805     }
00806 
00807     if (out_size > 0)
00808     {
00809         //  the user has specified a size
00810         out_size = MAX (out_size, out_buffer_size);     /* never below the built-in default output buffer size */
00811         out_buffer_size = out_size;
00812     }
00813     else
00814     {
00815         out_buffer_size = MAX (out_buffer_size, memory_available / (2 * nfiles + 2));
00816     }
00817 
00818     fprintf (stdout, "        merge_size = %ld\n", (long int) merge_size);     /* diagnostic dump of the chosen sizes, written to stdout */
00819     fprintf (stdout, "          out_size = %ld\n", (long int) out_size);
00820     fprintf (stdout, "  memory_available = %ld\n", (long int) memory_available);
00821     fprintf (stdout, " merge_buffer_size = %ld\n", (long int) merge_buffer_size);
00822     fprintf (stdout, "   out_buffer_size = %ld\n", (long int) out_buffer_size);
00823 
00824     /* Initialise the "buffer ready" handshake (worker thread -> merge loop). */
00825     flag_fcst_buffer_ready = false;
00826     pthread_cond_init (&flag_buffer_ready_cv, NULL);
00827     pthread_mutex_init (&flag_buffer_ready_mutex, NULL);
00828     /* Initialise the "start filling" handshake (merge loop -> worker thread). */
00829     flag_fcst_buffer_start = false;
00830     fillbuf_buf = NULL;
00831     fillbuf_buf2 = NULL;
00832     fillbuf_fp = NULL;
00833     fillbuf_file = NULL;
00834     fillbuf_ret = false;
00835     pthread_cond_init (&flag_fillbuf_start_cv, NULL);
00836     pthread_mutex_init (&flag_fillbuf_start_mutex, NULL);
00837 
00838     pthread_t fillbuf_thread_id;
00839     pthread_create (&fillbuf_thread_id, NULL, &fill_input_buffer_thread, NULL);        /* return value is not checked */
00840 
00841 
00842     struct mergefile *files = (struct mergefile *) calloc (nfiles, sizeof *files);
00843     if (!files)
00844         error (MERGE_FAILURE, 0, "memory exhausted");
00845 
00846     size_t i;
00847     for (i = 0; i < nfiles; ++i)
00848         files[i].name = ifiles[i];
00849 
00850     size_t nopened = open_input_files (files, nfiles, &fps, ifiles);
00851 
00852     if (nopened == nfiles)
00853     {
00854         FILE *ofp = fopen (output_file, "w");   /* NOTE(review): output_file is still NULL here when -o was not given - confirm a check is not needed */
00855         if (ofp)
00856         {
00857             merge_fps (files, nfiles, ofp, output_file, fps);
00858         }
00859         else if (errno != EMFILE || nopened <= 2)
00860             die (("open failed"), output_file);
00861     }
00862     else if (nopened <= 2)      /* other partial-open cases fall through without any message */
00863         die (("open failed"), files[nopened].name);
00864 
00865     pthread_cancel (fillbuf_thread_id);         /* the worker loops forever, so it is cancelled rather than asked to finish */
00866     pthread_join (fillbuf_thread_id, NULL);
00867 
00868     free (files);
00869 
00870     exit (EXIT_SUCCESS);
00871 }

static void merge_fps ( struct mergefile *  files,
size_t  nfiles,
FILE *  ofp,
char const *  output_file,
FILE **  fps 
) [static]

Merge lines from FILES onto OFP. NFILES is the number of files; FPS is the vector of open streams corresponding to the files. Close input and output streams before returning. OUTPUT_FILE gives the name of the output file.

buffer: Input buffers for each file.

cur: Current line in each line table.

base: Base of each line table.

ord: Table representing a permutation of fps, such that cur[ord[0]] is the smallest line and will be next output.

buffer_fcst: Buffers for forecasting.

ord_base: Table representing a permutation of buffers' ends, such that buffer_fcst[ord_base[0]] is the buffer with the smallest line and will be emptied first (and should be filled first).

out_buf: Output buffer.

Definition at line 544 of file for_merge.c.

References buffer_linelim(), compare(), die(), fill_input_buffer(), initbuf(), line::length, merge_buffer_size, MERGE_FAILURE, out_buffer_size, reconstruct_table(), swap_buffers(), line::text, write_bytes(), and xmalloc().

Referenced by main().

00545 {
00546     struct buffer *buffer = (struct buffer *) xmalloc (nfiles, sizeof *buffer); /*!< Input buffers for each file. */
00547     struct line const **cur = (struct line const **) xmalloc (nfiles, sizeof *cur); /*!< Current line in each line table. */
00548     struct line const **base = (struct line const **) xmalloc (nfiles, sizeof *base); /*!< Base of each line table.  */
00549     size_t *ord = (size_t *) xmalloc (nfiles, sizeof *ord); /*!< Table representing a permutation of fps, such that cur[ord[0]] is the smallest line and will be next output. */
00550     struct buffer *buffer_fcst = (struct buffer *) xmalloc (nfiles, sizeof *buffer); /*!< Buffers for forecasting. */
00551     size_t *ord_base = (size_t *) xmalloc (nfiles, sizeof *ord_base); /*!< Table representing a permutation of buffers' ends, such that buffer_fcst[ord_base[0]] is the buffer with smallest line and will be emptied first (and should be filled first) */
00552 
00553 
00554     size_t i;
00555     size_t j;
00556     size_t t;
00557 
00558     /*! output buffer */
00559     char *out_buf = malloc (out_buffer_size);
00560     if (!out_buf)
00561         error (MERGE_FAILURE, 0, "memory exhausted");
00562     char *out_buf_cur = out_buf;                /* next free byte in out_buf */
00563     size_t out_buf_len_cur = out_buffer_size;   /* free bytes remaining in out_buf */
00564 
00565     /* Prepare buffers and read initial lines from each input file. */
00566     for (i = 0; i < nfiles;)    /* i only advances when file i yielded data */
00567     {
00568         initbuf (&buffer[i], sizeof (struct line), merge_buffer_size);
00569         initbuf (&buffer_fcst[i], sizeof (struct line), merge_buffer_size);
00570 
00571         fill_input_buffer (&buffer_fcst[i], &buffer[i], fps[i], files[i].name);        /* ask the worker thread to fill the forecast buffer ... */
00572 
00573         if (swap_buffers (&buffer_fcst[i], &buffer[i]))         /* ... then wait for it and move the data into buffer[i] */
00574         {
00575             struct line const *linelim = buffer_linelim (&buffer[i]);
00576             cur[i] = linelim - 1;               /* first line read sits in the highest slot of the line table */
00577             base[i] = linelim - buffer[i].nlines;       /* last line read sits in the lowest slot */
00578             i++;
00579         }
00580         else
00581         {
00582             // fps[i] is empty; eliminate it from future consideration.  
00583             if (fclose (fps[i]) != 0)
00584                 die (("close failed"), files[i].name);
00585             free (buffer[i].buf);
00586             free (buffer_fcst[i].buf);
00587             --nfiles;
00588             for (j = i; j < nfiles; ++j)
00589             {
00590                 files[j] = files[j + 1];
00591                 fps[j] = fps[j + 1];
00592             }
00593         }
00594 
00595     }
00596 
00597     /* Set up the ord table according to comparisons among input lines.
00598        Since this only reorders two items if one is strictly greater than
00599        the other, it is stable. */
00600     for (i = 0; i < nfiles; ++i) {
00601         ord[i] = i;
00602     }
00603     for (i = 1; i < nfiles; ++i) {
00604         if (0 < compare (cur[ord[i - 1]], cur[ord[i]]))
00605             t = ord[i - 1], ord[i - 1] = ord[i], ord[i] = t, i = 0;    /* after a swap, restart the scan (i becomes 1 after ++i) */
00606     }
00607 
00608     /* Set up the ord_base table according to comparisons among input lines. */
00609     for (i = 0; i < nfiles; ++i)
00610         ord_base[i] = i;
00611     for (i = 1; i < nfiles; ++i)
00612         if (0 < compare (base[ord_base[i - 1]], base[ord_base[i]]))
00613             t = ord_base[i - 1], ord_base[i - 1] = ord_base[i], ord_base[i] = t, i = 0;
00614     /* Start prefetching for the input whose last buffered line is smallest. */
00615     fill_input_buffer (&buffer_fcst[ord_base[0]], &buffer[ord_base[0]], fps[ord_base[0]], files[ord_base[0]].name);
00616 
00617     /* Repeatedly output the smallest line until no input remains. */
00618     while (nfiles)
00619     {
00620         struct line const *smallest = cur[ord[0]];
00621 
00622         if (out_buf_len_cur > smallest->length)
00623         {
00624             memmove (out_buf_cur, smallest->text, smallest->length);
00625             out_buf_len_cur -= smallest->length;
00626             out_buf_cur += smallest->length;
00627         }
00628         else
00629         {
00630             write_bytes (out_buf, out_buffer_size - out_buf_len_cur, ofp, output_file);        /* flush the full output buffer, then start over at its beginning */
00631             out_buf_len_cur = out_buffer_size;
00632             out_buf_cur = out_buf;
00633             /* NOTE(review): the copy below does not check that the line fits in out_buffer_size - confirm lines cannot exceed it */
00634             memmove (out_buf_cur, smallest->text, smallest->length);
00635             out_buf_len_cur -= smallest->length;
00636             out_buf_cur += smallest->length;
00637         }
00638 
00639         /* Check if we need to read more lines into core. */
00640         if (base[ord[0]] < smallest)
00641             cur[ord[0]] = smallest - 1;         /* more lines buffered: the next one is one slot lower */
00642         else
00643         {
00644             if (buffer[ord[0]].eof)
00645             {
00646                 //We reached EOF on fps[ord[0]].  
00647                 for (i = 1; i < nfiles; ++i)    /* renumber indices above the entry about to be removed */
00648                     if (ord[i] > ord[0])
00649                         --ord[i];
00650 
00651                 for (i = 1; i < nfiles; ++i)
00652                     if (ord_base[i] > ord_base[0])
00653                         --ord_base[i];
00654 
00655                 --nfiles;
00656                 if (fclose (fps[ord[0]]) != 0)
00657                     die (("close failed"), files[ord[0]].name);
00658 
00659                 free (buffer[ord[0]].buf);
00660                 for (i = ord[0]; i < nfiles; ++i)       /* close the gap in the per-file arrays */
00661                 {
00662                     fps[i] = fps[i + 1];
00663                     files[i] = files[i + 1];
00664                     buffer[i] = buffer[i + 1];
00665                     cur[i] = cur[i + 1];
00666                     base[i] = base[i + 1];
00667                 }
00668                 for (i = 0; i < nfiles; ++i)
00669                     ord[i] = ord[i + 1];
00670 
00671                 free (buffer_fcst[ord_base[0]].buf);    /* NOTE(review): the forecast buffer is selected via ord_base[0] while the finished file is ord[0] - confirm these always refer to the same input */
00672                 for (i = ord_base[0]; i < nfiles; ++i)
00673                 {
00674                     buffer_fcst[i] = buffer_fcst[i + 1];
00675                 }
00676                 for (i = 0; i < nfiles; ++i)
00677                     ord_base[i] = ord_base[i + 1];
00678 
00679                 continue;
00680             }
00681             else
00682             {
00683                 /* The end of buffer has just been reached.
00684                    Forecasted buffer has already been filled and ready to use. */
00685                 bool return_code = swap_buffers (&buffer_fcst[ord_base[0]], &buffer[ord[0]]);
00686 
00687                 struct line const *linelim = buffer_linelim (&buffer[ord[0]]);
00688                 cur[ord[0]] = linelim - 1;
00689                 base[ord[0]] = linelim - buffer[ord[0]].nlines;
00690 
00691                 reconstruct_table (nfiles, base, ord_base);     /* re-sort the forecast order for the refreshed buffer */
00692 
00693                 if (return_code)        /* the fill produced data: queue the next prefetch */
00694                     fill_input_buffer (&buffer_fcst[ord_base[0]], &buffer[ord_base[0]], fps[ord_base[0]], files[ord_base[0]].name);
00695             }
00696         }
00697         reconstruct_table (nfiles, cur, ord);   /* move ord[0] back to its sorted position */
00698     }                           // while (nfiles)
00699 
00700     /* Flush whatever is still pending in the output buffer. */
00701     if (out_buffer_size != out_buf_len_cur)
00702     {
00703         write_bytes (out_buf, out_buffer_size - out_buf_len_cur, ofp, output_file);
00704     }
00705 
00706     if (fclose (ofp) != 0)
00707         die (("close failed"), output_file);
00708     free (fps);
00709     free (buffer);
00710     free (buffer_fcst);
00711     free (ord);
00712     free (ord_base);
00713     free (base);
00714     free (cur);
00715     free (out_buf);
00716 }

static size_t open_input_files ( struct mergefile *  files,
size_t  nfiles,
FILE ***  pfps,
char *const *  f 
) [static]

Open FILES (there are NFILES of them) and store the resulting array of stream pointers into (*PFPS). Allocate the array. Return the number of successfully opened files, setting errno if this value is less than NFILES.

Definition at line 483 of file for_merge.c.

References xmalloc().

Referenced by main().

00484 {
00485     FILE **fps = *pfps = xmalloc (nfiles, sizeof *fps);        /* the stream array is handed back to the caller through *pfps */
00486     int i;      /* NOTE(review): signed int compared with the size_t nfiles and returned as size_t */
00487 
00488     /* Open input files with mmap for reading. */
00489     for (i = 0; i < nfiles; i++)
00490     {
00491         fps[i] = fopen (files[i].name, "rm");
00492         if (!fps[i])
00493             break;              /* stop at the first failure; i is then the index of the file that failed */
00494     }
00495 
00496     return i;
00497 }

double physmem_available (  ) 

Return the amount of physical memory available.

Definition at line 236 of file for_merge.c.

Referenced by get_memory_available().

00237 {
00238 #if defined _SC_AVPHYS_PAGES && defined _SC_PAGESIZE
00239     {                           /* This works on linux-gnu, solaris2 and cygwin.  */
00240         double pages = sysconf (_SC_AVPHYS_PAGES);
00241         double pagesize = sysconf (_SC_PAGESIZE);
00242         if (0 <= pages && 0 <= pagesize)        /* a negative value means the sysconf query failed */
00243             return pages * pagesize;
00244     }
00245 #endif
00246     return 0;                   /* fallback when the sysconf names are missing or a query failed */
00247 
00248 }

double physmem_total (  ) 

Return the total amount of physical memory.

Definition at line 220 of file for_merge.c.

Referenced by get_memory_available().

00221 {
00222 #if defined _SC_PHYS_PAGES && defined _SC_PAGESIZE
00223     {                           /* This works on linux-gnu, solaris2 and cygwin.  */
00224         double pages = sysconf (_SC_PHYS_PAGES);
00225         double pagesize = sysconf (_SC_PAGESIZE);
00226         if (0 <= pages && 0 <= pagesize)        /* a negative value means the sysconf query failed */
00227             return pages * pagesize;
00228     }
00229 #endif
00230     return 0;                   /* fallback when the sysconf names are missing or a query failed */
00231 }

void reconstruct_table ( size_t  nfiles,
struct line const **  cur,
size_t *  ord 
) [inline]

The new line just read in may be larger than other lines already in main memory; push it back in the queue until we encounter a line larger than it. Optimize for the common case where the new line is smallest.

Definition at line 454 of file for_merge.c.

References compare().

Referenced by merge_fps().

00455 {
00456     int j;      /* NOTE(review): signed index compared with the size_t count_of_smaller_lines below */
00457     size_t lo = 1;
00458     size_t hi = nfiles;
00459     size_t probe = lo;          /* first probe is slot 1, which favours the case where ord[0] is still the smallest */
00460     size_t ord0 = ord[0];
00461     size_t count_of_smaller_lines;
00462     /* Binary search in ord[1..nfiles-1] for the slot where ord0 belongs. */
00463     while (lo < hi)
00464     {
00465         int cmp = compare (cur[ord0], cur[ord[probe]]);
00466         if (cmp < 0 || (cmp == 0 && ord0 < ord[probe]))        /* equal lines: the lower file index goes first */
00467             hi = probe;
00468         else
00469             lo = probe + 1;
00470         probe = (lo + hi) / 2;
00471     }
00472 
00473     count_of_smaller_lines = lo - 1;
00474     for (j = 0; j < count_of_smaller_lines; j++)        /* shift the smaller entries one slot toward the front */
00475         ord[j] = ord[j + 1];
00476     ord[count_of_smaller_lines] = ord0;
00477 }

bool swap_buffers ( struct buffer *  from,
struct buffer *  to 
)

Swap buffers: first one has new data, second has old data.

Definition at line 517 of file for_merge.c.

References buffer::alloc, buffer::buf, buffer::eof, fillbuf_ret, flag_buffer_ready_cv, flag_buffer_ready_mutex, flag_fcst_buffer_ready, buffer::left, buffer::line_bytes, buffer::nlines, and buffer::used.

Referenced by merge_fps().

00518 {   /* Waits until flag_fcst_buffer_ready is set, transfers the state of `from` into `to`, and returns fillbuf_ret. */
00519     pthread_mutex_lock (&flag_buffer_ready_mutex);
00520     while (!flag_fcst_buffer_ready)   /* re-test the flag after every wakeup */
00521         pthread_cond_wait (&flag_buffer_ready_cv, &flag_buffer_ready_mutex);
00522     flag_fcst_buffer_ready = false;   /* consume the notification */
00523     bool return_code = fillbuf_ret;   /* read while the mutex is still held */
00524 
00525     to->line_bytes = from->line_bytes;   /* bookkeeping fields are copied one way, from -> to */
00526     to->alloc = from->alloc;
00527     to->used = from->used;
00528     to->left = from->left;
00529     to->nlines = from->nlines;
00530     to->eof = from->eof;
00531     char *tmp = to->buf;   /* the data pointers are exchanged, not copied: */
00532     to->buf = from->buf;
00533     from->buf = tmp;   /* `from` ends up holding the storage `to` had before the call */
00534 
00535     pthread_mutex_unlock (&flag_buffer_ready_mutex);
00536     
00537     return return_code;
00538 }

void usage ( int  status  ) 

Definition at line 152 of file for_merge.c.

References EXIT_SUCCESS, and program_name.

Referenced by main().

00153 {   /* Prints a one-line hint to stderr for a non-success status, otherwise the full help text to stdout; then exits with status. */
00154     if (status != EXIT_SUCCESS)
00155         fprintf (stderr, "Try `%s --help' for more information.\n", program_name);
00156     else
00157     {
00158         printf (("\
00159 Usage: %s [OPTION]... [FILE1]...\n\
00160 "), program_name);
00161         fputs (("\
00162 Write concatenation of sorted FILEs to file.\n\
00163 \n\
00164 "), stdout);
00165         fputs (("\
00166 Mandatory arguments to long options are mandatory for short options too.\n\
00167 "), stdout);
00168         fputs (("\
00169 Options:\n\
00170 \n\
00171 "), stdout);
00172         fputs (("\
00173   -o, --output=OFILE        MANDATORY - write result to OFILE\n\
00174   -s, --buffer-size=SIZE    use SIZE bytes for input buffer (2*SIZE for each input file)\n\
00175   -S, --output-size=SIZE    use SIZE bytes for output buffer\n\
00176   -z, --zero-terminated     end lines with 0 byte, not newline\n\
00177   -h, --help                print this help\n\
00178 "), stdout);
00179         fputs (("\
00180 \n\
00181 \n\
00182 *** WARNING ***\n\
00183 The locale specified by the environment doesn't affect sort order.\n\
00184 LC_ALL=C is set to get the traditional sort order that uses\n\
00185 native byte values.\n\
00186 *** NOTE ***\n\
00187 Forecasting merge algorithm (P-way merge) \n\
00188 is required 2*P input buffers, where P - number of sorted input files.\n\
00189 "), stdout);
00190     }
00191 
00192     exit (status);   /* reached on both paths; control does not return to the caller */
00193 }

static void write_bytes ( const char *  buf,
size_t  n_bytes,
FILE *  fp,
const char *  output_file 
) [static]

Definition at line 212 of file for_merge.c.

References die().

Referenced by merge_fps().

00213 {   /* Writes n_bytes bytes from buf to fp; calls die() with output_file if fewer were written. */
00214     if (fwrite (buf, 1, n_bytes, fp) != n_bytes)   /* element size is 1, so the return value is a byte count */
00215         die (("write failed"), output_file);
00216 }

void* xmalloc ( size_t  n,
size_t  s 
)

Allocate N * S bytes of memory dynamically, with error checking.

Definition at line 142 of file for_merge.c.

References MERGE_FAILURE.

Referenced by main(), merge_fps(), and open_input_files().

00143 {   /* Allocates n * s bytes with malloc; reports "memory exhausted" through error() with status MERGE_FAILURE on failure. */
00144     size_t ns = n * s;   /* NOTE(review): the product is not checked for size_t overflow */
00145     void *p = malloc (ns);
00146     if (!p && ns != 0)   /* a NULL result for a zero-byte request is not treated as a failure */
00147         error (MERGE_FAILURE, 0, "memory exhausted");   /* presumably glibc error(), which exits when status is nonzero -- confirm */
00148     return p;
00149 }


Variable Documentation

char eolchar = '\n' [static]

The character marking end of line. Defaults to '\n'.

Definition at line 60 of file for_merge.c.

Referenced by fill_input_buffer_thread(), and main().

struct buffer* fillbuf_buf

Definition at line 127 of file for_merge.c.

Referenced by fill_input_buffer(), fill_input_buffer_thread(), and main().

struct buffer* fillbuf_buf2

Definition at line 128 of file for_merge.c.

Referenced by fill_input_buffer(), fill_input_buffer_thread(), and main().

char const* fillbuf_file

Definition at line 130 of file for_merge.c.

Referenced by fill_input_buffer(), fill_input_buffer_thread(), and main().

FILE* fillbuf_fp

Definition at line 129 of file for_merge.c.

Referenced by fill_input_buffer(), fill_input_buffer_thread(), and main().

bool fillbuf_ret

Definition at line 131 of file for_merge.c.

Referenced by fill_input_buffer_thread(), main(), and swap_buffers().

pthread_cond_t flag_buffer_ready_cv

Definition at line 120 of file for_merge.c.

Referenced by fill_input_buffer_thread(), main(), and swap_buffers().

pthread_mutex_t flag_buffer_ready_mutex

Definition at line 121 of file for_merge.c.

Referenced by fill_input_buffer_thread(), main(), and swap_buffers().

bool flag_fcst_buffer_ready

This flag indicates that the input buffer has been filled and is ready to use.

Definition at line 119 of file for_merge.c.

Referenced by fill_input_buffer_thread(), main(), and swap_buffers().

bool flag_fcst_buffer_start

This flag wakes up the thread that fills the forecasted input buffer from disk.

Definition at line 124 of file for_merge.c.

Referenced by fill_input_buffer(), fill_input_buffer_thread(), and main().

pthread_cond_t flag_fillbuf_start_cv

Definition at line 125 of file for_merge.c.

Referenced by fill_input_buffer(), fill_input_buffer_thread(), and main().

pthread_mutex_t flag_fillbuf_start_mutex

Definition at line 126 of file for_merge.c.

Referenced by fill_input_buffer(), fill_input_buffer_thread(), and main().

struct option const long_options[] [static]

Initial value:

 {
    {"output", required_argument, NULL, 'o'},
    {"buffer-size", required_argument, NULL, 's'},
    {"output-size", required_argument, NULL, 'S'},
    {"zero-terminated", no_argument, NULL, 'z'},
    {"help", no_argument, NULL, 'h'},
    {NULL, 0, NULL, 0},   /* all-zero sentinel entry; presumably the terminator getopt_long expects -- confirm against main() */
}

Definition at line 197 of file for_merge.c.

Referenced by main().

size_t merge_buffer_size = MAX (MIN_MERGE_BUFFER_SIZE, 256 * 1024) [static]

The number of bytes needed for a merge.

Definition at line 109 of file for_merge.c.

Referenced by main(), and merge_fps().

size_t out_buffer_size = MAX (MIN_MERGE_BUFFER_SIZE, 256 * 1024) [static]

The number of bytes needed for output buffer.

Definition at line 112 of file for_merge.c.

Referenced by main(), and merge_fps().

char const short_options[] = "o:s:S:zh" [static]

Definition at line 195 of file for_merge.c.

Referenced by main().


Generated on Mon Aug 24 18:22:18 2009 for for_merge - merging some sorted files with large sizes by  doxygen 1.4.7