#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
/************************************************************************
htstat.c

    Summarizes statistics of an HTTP access log file

    htstat file -u[n] -f[n,l] -d[n,l] -t

    where:
        file    is the input file (if none specified, uses STDIN)

        -u      gives a report on access by user
        -f      gives a report on access by filename
        -d      gives a report on access by domain
        -t      gives a report by date (t stands for "time")

        n       is the "minimum # of hits to report" (default 1)
        l       is the "number of levels to store" (default 1)

                Example - given a user name of: nessus.vip.best.com
                storing the domain to level 1 gives "com"; this is the default
                storing the domain to level 2 gives "best.com"
                storing the domain to level 3 gives "vip.best.com"

                given a file name of:
                http://www.best.com/~nessus/jde.gif

                level 0 gives the entire file name  (inefficient!)
                level 1 gives the filename "jde.gif" (the default)
                level 2 gives the filename "~nessus/jde.gif"

   (c) 1995 J. David Eisenberg
   Note: although this program is copyright, you may feel free to
   distribute/modify the code as long as you keep the original credit
   in the program.  Please e-mail any improvements to nessus@best.com
   Thanks!
*************************************************************************/
#define TRUE    1
#define FALSE   0
#define true    1
#define false   0

#ifndef max
#define max(a,b) ((a) > (b) ? (a) : (b))
#endif

typedef short BOOLEAN;

typedef struct tagAccess ACCESS;

struct tagAccess
{
    char    *name;
    ACCESS  *lTree, *rTree;
    long    nAccesses;
};

/* domain names consisting of only digits are considered to be IP numbers */
char ANON_IP[] = "(IP number only)";

FILE    *inFile = NULL;

ACCESS  *userHead = NULL;
ACCESS  *fileHead = NULL;
ACCESS  *domainHead = NULL;
ACCESS  *dateHead = NULL;

long    gCutoff;        /* to avoid passing parameter to recursive routine */
long    userCutoff;     /* minimum # of hits to report in user listing */
long    fileCutoff;     /* minimum # of hits to report in file listing */
long    domainCutoff;   /* minimum # of hits to report in domain listing */

int gFieldLen;          /* to avoid passing this to recursive print proc */
int maxUserLen = 0;     /* maximum length of user name */
int maxFileLen = 0;     /* maximum length of file name */
int maxDomainLen = 0;   /* maximum length of domain name */
int maxDateLen = 0;     /* maximum length of time/date field */

long    totBytes;

BOOLEAN listUsers = FALSE;
BOOLEAN listFiles = FALSE;
BOOLEAN listDomains = FALSE;
BOOLEAN listDates = FALSE;
BOOLEAN noOptions = TRUE;

long        fileLevel = 1;
long        domainLevel = 1;

char        *listName[] = {
                "user", "users",
                "domain", "domains",
                "file", "files",
                "date", "dates"
            };

BOOLEAN dbug = FALSE;

/******************************************************************
newNode
    purpose:
        allocate a new ACCESS structure.
    input:
        name    - the entry into the structure's name field
    returns:
        node created
        NULL if out of memory
*******************************************************************/
ACCESS  *newNode(char *name)
{
    ACCESS  *temp;

    temp = calloc(1, sizeof(ACCESS));
    if (!temp)
    {
        fprintf(stderr,"Out of memory.\n");
        return NULL;
    }
    temp->name = calloc(strlen(name)+1, sizeof(char));
    if (!temp->name)
    {
        free(temp);
        fprintf(stderr, "Out of memory allocating name.\n");
        return NULL;
    }
    strcpy(temp->name, name);
    temp->lTree = temp->rTree = NULL;
    temp->nAccesses = 0;
    return temp;
}

/******************************************************************
search
    purpose:
        searches for the given name in a binary tree.
        If not found, allocates a node and enters the name at the
        proper location.
    input:
        name    - the item to search for
        head    - the root of the tree to search through
    returns:
        node where item was found
        NULL if out of memory
*******************************************************************/
ACCESS  *search(char *name, ACCESS *head)
{
    int     cmp;

    while (head != NULL)
    {
        cmp = strcmp(name, head->name);
        if (cmp == 0)
            break;

        if (cmp < 0)
        {
            if (!head->lTree)
            {
                head->lTree = newNode(name);
                if (!head->lTree)
                    return NULL;
            }
            head = head->lTree;
        }
        else if (cmp > 0)
        {
            if (!head->rTree)
            {
                head->rTree = newNode(name);
                if (!head->rTree)
                    return NULL;
            }
            head = head->rTree;
        }
    }

    return head;
}

/******************************************************************
printAlphaTree
    purpose:
        prints a binary tree in alphabetical order.
    input:
        head    - the root of the tree to print
    returns:
        number of items printed
*******************************************************************/
long printAlphaTree(ACCESS *head)
{
    int     i;
    long        n = 0;

    if (head)
    {
        n += printAlphaTree(head->lTree);
        if (head->nAccesses >= gCutoff)
        {
            i = gFieldLen-strlen(head->name)+1;
            printf("%s %*c %ld\n", head->name, i, ' ', head->nAccesses);
            n++;
        }
        n += printAlphaTree(head->rTree);
    }
    return n;
}

/******************************************************************
countTree
    purpose:
        calculates number of items in a tree.
    input:
        head    - the root of the tree to print
    returns:
        the number of items in the tree
*******************************************************************/
size_t countTree(ACCESS *head)
{
    size_t  n = 0;

    if (head)
    {
        n += countTree(head->lTree);
        n++;
        n += countTree(head->rTree);
    }
    return n;
}

/******************************************************************
treeToArray
    purpose:
        take pointers from binary tree and put them into a sortable
        array.
    input:
        head    - the root of the tree to print
        array - the linear array destination
        n - starting point in the array
    returns:
        ending position in array
*******************************************************************/
size_t treeToArray(ACCESS *head, ACCESS **array, size_t n)
{
    if (head)
    {
        n = treeToArray(head->lTree, array, n);
        array[n++] = head;
        n = treeToArray(head->rTree, array, n);
    }
    return n;
}

/******************************************************************
accessSort
    purpose:
        sort an array of ACCESS * into descending order by number of
        accesses. In case of a tie, sort alpha ascending. Used in call
        to qsort.
    input:
        a - pointer to an ACCESS *
        b - pointer to an ACCESS *
    returns:
        -1 for a with more accesses than b
        0 for a with same # of accesses as b
        1 for a with fewer access than b
*******************************************************************/
int accessSort( const void *a, const void *b)
{
    if ((*((ACCESS **) a))->nAccesses < (*((ACCESS **) b))->nAccesses)
        return 1;
    if ((*((ACCESS **) a))->nAccesses > (*((ACCESS **) b))->nAccesses)
        return -1;
    return( strcmp((*((ACCESS **) a))->name, (*((ACCESS **) b))->name) );
}

/******************************************************************
accessPrint
    purpose:
        sorts a set of ACCESS records by number of accesses and
        prints the results
    input:
        head    - the root of the tree to print
    returns:
        number of lines printed, or
        -1  if not enough memory was available for sort
*******************************************************************/
long accessPrint(ACCESS *head)
{
    ACCESS  **accessArray;
    size_t  i, nItems, nPrint;
    int     nBlanks;

    if (head)
    {
        nPrint = 0;
        nItems = countTree(head);
        accessArray = (ACCESS **) malloc(nItems * sizeof(ACCESS *));
        if (!accessArray)
        {
            fprintf(stderr, "Unable to allocate array for sort.\n");
            return -1;
        }

        treeToArray(head, accessArray, 0);

        qsort((void *)accessArray, nItems, sizeof(ACCESS *), accessSort);

        for (i=0; i<nItems; i++)
        {
            if (accessArray[i]->nAccesses < gCutoff)
                break;
            nBlanks = gFieldLen-strlen(accessArray[i]->name)+1;
            printf("%s %*c %ld\n", accessArray[i]->name, nBlanks, ' ',
                accessArray[i]->nAccesses);
            nPrint++;
        }

        free(accessArray);

        return nPrint;
    }
    else
        return -1;
}

/******************************************************************
deAllocate
    purpose:
        releases memory allocated to a tree
    input:
        head    - the root of the tree to deallocate
*******************************************************************/
void deAllocate(ACCESS *head)
{
    if (head)
    {
        deAllocate(head->lTree);
        deAllocate(head->rTree);
        free(head->name);
        free(head);
    }
}

/******************************************************************
setValue
    purpose:
        set an integer value from the argument list, and
        increment pointer to argument string.
    input:
        p - current pointer to argument
        thingToSet - the value to set
    returns:
        pointer to location past the integer
*******************************************************************/
char *setValue(char *p, long *thingToSet)
{
    int i;

    i = strspn(p, "012345789");
    if (i)
        *thingToSet = atol(p);
    return p + i;
}

/******************************************************************
analyzeOption
    purpose:
        analyze a single "-" option in the command line
    input:
        the argument, extracted from the argv array
    NOTE:
        sets global variables for hit-limit cutoffs and
        levels-to-display. If any valid option is used, then
        "noOptions" is set FALSE, so we will not get
        any "user list" output if we want only "file lists".
*******************************************************************/
void analyzeOption(char *arg)
{
    char    *p;

    p = arg + 1;    /* skip the leading hyphen */
    while (*p)
    {
        if (*p == 'u')  /* can be followed by limit */
        {
            noOptions = FALSE;
            listUsers = TRUE;
            p++;
            p = setValue(p, &userCutoff);
        }
        if (*p == 'f')  /* can be followed by limit and level */
        {
            noOptions = FALSE;
            listFiles = TRUE;
            p++;
            p = setValue(p, &fileCutoff);
            if (*p == ',')
            {
                p++;
                p = setValue(p, &fileLevel);
            }
        }
        else if (*p == 'd') /* can be followed by limit and # of levels */
        {
            noOptions = FALSE;
            listDomains = TRUE;
            p++;
            p = setValue(p, &domainCutoff);
            if (*p == ',')
            {
                p++;
                p = setValue(p, &domainLevel);
            }
        }
        else if (*p == 't')
        {
            listDates = TRUE;
            p++;
        }
    }
}

/******************************************************************
analyzeArguments
    purpose:
        check validity of argument list.
    input:
        argc    - number of arguments on command line
        argv  - array of arguments
    returns:
        TRUE    if argument list seems all right
        FALSE if more than one argument begins with a non-dash
                or does not name a valid file
*******************************************************************/
BOOLEAN analyzeArguments(int argc, char *argv[])
{
    int i;
    for (i=1; i<argc; i++)
    {
        if (*argv[i] == '-')
            analyzeOption(argv[i]);
        else if (inFile)
            fprintf(stderr, "Please specify only one file.\n");
        else
        {
            inFile = fopen(argv[i], "r");
            if (!inFile)
            {
                fprintf(stderr, "Unable to open file %s\n", argv[i]);
                return FALSE;
            }
        }
    }
    return TRUE;
}

/******************************************************************
printFooter
    purpose:
        prints the total number of items output by accessPrint
        or printAlphaTable.
    input:
        nPrint - number of lines printed (comes from the print routine)
        listType - which list was printed
*******************************************************************/
void printFooter(long nPrint, int listType)
{
    if (nPrint == 1)
        printf("1 %s\n", listName[listType*2]);
    else
        printf("%ld %s\n", nPrint, listName[listType*2+1]);
}


void main(int argc, char *argv[])
{
    char        oneLine[512];
    char        *p;
    char        *pUser, *pDomain, *pFile, *pDate;
    ACCESS      *theUser, *theDomain, *theFile, *theDate;
    long        nLines, totLines;
    int         i;
    long        nPrint;

    if (!analyzeArguments(argc, argv))
    {
        printf("Improper arguments\n");
        exit(EXIT_FAILURE);
    }

    if (!inFile)
        inFile = stdin;

    totLines = nLines = 0;
    totBytes = 0;

    do
    {
        *oneLine = '\0';
        p = fgets(oneLine, 511, inFile);
        if (!p) /* we've read everything - exit now */
        {
            totLines = nLines;
            break;
        }

        nLines++;

        /* now split line into user, file, domain, and time info */
        pUser = strtok(oneLine, " ");   /* user name */
        p =     strtok(NULL, " ");      /* IP or file */
        p =     strtok(NULL, " ");      /* - separator */
        pDate = strtok(NULL, " ");      /* date and time */
        p =     strtok(NULL, " ");      /* the GMT offset */
        p =     strtok(NULL, " ");      /* "GET or whatever */
        pFile = strtok(NULL, " ");      /* the actual file name */
        p =     strtok(NULL, " ");      /* HTTP method */
        p =     strtok(NULL, " ");      /* error code */
        p =     strtok(NULL, " ");      /* # of bytes */
        totBytes += atol(p);

        pDate++;
        p = strchr(pDate, ':');      /* cut off hour/min/sec */
        if (p)
            *p = '\0';

        /* back up to find the levels of domain we want */
        pDomain = pUser + strlen(pUser) - 1;
        i = 0;
        while (pDomain > pUser)
        {
            if (*pDomain == '.')
            {
                i++;
                if (i == domainLevel)
                {
                    pDomain++;
                    break;
                }
            }
            pDomain--;
        }

        /* back off and find the tail end of the file name
            to desired number of levels */
        p = pFile + strlen(pFile) - 1;
        i = 0;
        while (p > pFile)
        {
            if (*p == '/')
            {
                i++;
                if (i == fileLevel)
                {
                    p++;
                    break;
                }
            }
            p--;
        }
        pFile = p;

        /* construct the binary trees */
      if (*pUser)
        {
            if (!userHead)
                theUser = userHead = newNode(pUser);
            else
                theUser = search(pUser, userHead);
            if (!theUser)
                break;

            theUser->nAccesses++;
            maxUserLen = max(strlen(pUser), maxUserLen);
        }

        if (*pDomain)
        {
            if (strspn(pDomain, "0123456789.") == strlen(pDomain))
                pDomain = ANON_IP;

            if (!domainHead)
                theDomain = domainHead = newNode(pDomain);
            else
                theDomain = search(pDomain, domainHead);

            if (!theDomain)
                break;

            theDomain->nAccesses++;
            maxDomainLen = max(strlen(pDomain), maxDomainLen);
        }

        if (*pFile)
        {
            if (!fileHead)
                theFile = fileHead = newNode(pFile);
            else
                theFile = search(pFile, fileHead);
            if (!theFile)
                break;

            theFile->nAccesses++;
            maxFileLen = max(strlen(pFile), maxFileLen);
        }

        if (*pDate)
        {
            if (!dateHead)
                theDate = dateHead = newNode(pDate);
            else
                theDate = search(pDate, dateHead);
            if (!theDate)
                break;

            theDate->nAccesses++;
            maxDateLen = max(strlen(pDate), maxDateLen);
        }

    } while (TRUE);

    /* if we had a memory error, read through the entire file */
    if (totLines != nLines)
    {
        totLines = nLines;
        while (p = fgets(oneLine, 511, inFile))
            totLines++;
    }

    fclose(inFile);
    printf("Lines processed: %ld of %ld\n", nLines, totLines);
    printf("\n");
    if (listUsers)
    {
        printf("Access by User:\n");
        gFieldLen = maxUserLen;
        gCutoff = userCutoff;
        nPrint = accessPrint(userHead);
        if (nPrint < 0)
            nPrint = printAlphaTree(userHead);
        printFooter(nPrint, 0);
        printf("\n");
    }
    printf("Total bytes: %ld\n\n", totBytes);

    if (noOptions)
    {
        printf("Total number of users: %u\n", countTree(userHead));
    }
    deAllocate(userHead);

    if (listDomains)
    {
        printf("\nAccess by Domain:\n");
        gFieldLen = maxDomainLen;
        gCutoff = domainCutoff;
        if ((nPrint = accessPrint(domainHead)) < 0)
            nPrint = printAlphaTree(domainHead);
        printFooter(nPrint, 1);
        printf("\n");
    }
    if (noOptions)
        printf("Total number of domains: %u\n", countTree(domainHead));
    deAllocate(domainHead);

    if (listFiles)
    {
        printf("\nAccess by File:\n");
        gFieldLen = maxFileLen;
        gCutoff = fileCutoff;
        if ((nPrint = accessPrint(fileHead)) < 0)
            nPrint = printAlphaTree(fileHead);
        printFooter(nPrint, 2);
        printf("\n");
    }
    if (noOptions)
        printf("Total number of files: %u\n", countTree(fileHead));
    deAllocate(fileHead);

    if (listDates)
    {
        printf("\nAccess by Date:\n");
        gFieldLen = maxDateLen;
        gCutoff = 1;
        if ((nPrint = accessPrint(dateHead)) < 0)
            nPrint = printAlphaTree(dateHead);
        printFooter(nPrint, 3);
    }
    if (noOptions)
        printf("Total number of dates: %u\n", countTree(dateHead));
    deAllocate(dateHead);

}

