00001 
00013 #include <GKlib.h>
00014 
00015 
00017 
00018 typedef struct {
00019   int minfreq;  
00020   int maxfreq;  
00021   int minlen;   
00022   int maxlen;   
00023   int tnitems;  
00024 
00025   
00026   void (*callback)(void *stateptr, int nitems, int *itemids, int ntrans, int *transids); 
00027   void *stateptr;   
00028 
00029   
00030   int *rmarker;
00031   gk_ikv_t *cand;
00032 } isparams_t;
00033 
00034 
00035 
00037 
00038 void itemsets_find_frequent_itemsets(isparams_t *params, gk_csr_t *mat, 
00039          int preflen, int *prefix);
00040 gk_csr_t *itemsets_project_matrix(isparams_t *param, gk_csr_t *mat, int cid);
00041 
00042 
00043 
00044 
00046 
00047 void gk_find_frequent_itemsets(int ntrans, ssize_t *tranptr, int *tranind, 
00048         int minfreq, int maxfreq, int minlen, int maxlen, 
00049         void (*process_itemset)(void *stateptr, int nitems, int *itemids, 
00050                                 int ntrans, int *transids),
00051         void *stateptr)
00052 {
00053   ssize_t i;
00054   gk_csr_t *mat, *pmat;
00055   isparams_t params;
00056   int *pattern;
00057 
00058   
00059   mat = gk_csr_Create();
00060   mat->nrows  = ntrans;
00061   mat->ncols  = tranind[gk_iargmax(tranptr[ntrans], tranind)]+1;
00062   mat->rowptr = gk_zcopy(ntrans+1, tranptr, gk_zmalloc(ntrans+1, "gk_find_frequent_itemsets: mat.rowptr"));
00063   mat->rowind = gk_icopy(tranptr[ntrans], tranind, gk_imalloc(tranptr[ntrans], "gk_find_frequent_itemsets: mat.rowind"));
00064   mat->colids = gk_iincset(mat->ncols, 0, gk_imalloc(mat->ncols, "gk_find_frequent_itemsets: mat.colids"));
00065 
00066   
00067   params.minfreq  = minfreq;
00068   params.maxfreq  = (maxfreq == -1 ? mat->nrows : maxfreq);
00069   params.minlen   = minlen;
00070   params.maxlen   = (maxlen == -1 ? mat->ncols : maxlen);
00071   params.tnitems  = mat->ncols;
00072   params.callback = process_itemset;
00073   params.stateptr = stateptr;
00074   params.rmarker  = gk_ismalloc(mat->nrows, 0, "gk_find_frequent_itemsets: rmarker");
00075   params.cand     = gk_ikvmalloc(mat->ncols, "gk_find_frequent_itemsets: cand");
00076 
00077   
00078   gk_csr_CreateIndex(mat, GK_CSR_COL);
00079   pmat = itemsets_project_matrix(¶ms, mat, -1);
00080   gk_csr_Free(&mat);
00081 
00082   pattern = gk_imalloc(pmat->ncols, "gk_find_frequent_itemsets: pattern");
00083   itemsets_find_frequent_itemsets(¶ms, pmat, 0, pattern); 
00084 
00085   gk_csr_Free(&pmat);
00086   gk_free((void **)&pattern, ¶ms.rmarker, ¶ms.cand, LTERM);
00087 
00088 }
00089 
00090 
00091 
00092 
00094 
00095 void itemsets_find_frequent_itemsets(isparams_t *params, gk_csr_t *mat, 
00096          int preflen, int *prefix)
00097 {
00098   ssize_t i;
00099   gk_csr_t *cmat;
00100 
00101   
00102   for (i=0; i<mat->ncols; i++) {
00103     prefix[preflen] = mat->colids[i];
00104 
00105     if (preflen+1 >= params->minlen)
00106       (*params->callback)(params->stateptr, preflen+1, prefix, 
00107            mat->colptr[i+1]-mat->colptr[i], mat->colind+mat->colptr[i]);
00108 
00109     if (preflen+1 < params->maxlen) {
00110       cmat = itemsets_project_matrix(params, mat, i);
00111       itemsets_find_frequent_itemsets(params, cmat, preflen+1, prefix);
00112       gk_csr_Free(&cmat);
00113     }
00114   }
00115 
00116 }
00117 
00118 
00119 
00127 
00128 gk_csr_t *itemsets_project_matrix(isparams_t *params, gk_csr_t *mat, int cid)
00129 {
00130   ssize_t i, j, k, ii, pnnz;
00131   int nrows, ncols, pnrows, pncols;
00132   ssize_t *colptr, *pcolptr;
00133   int *colind, *colids, *pcolind, *pcolids, *rmarker;
00134   gk_csr_t *pmat;
00135   gk_ikv_t *cand;
00136 
00137   nrows  = mat->nrows;
00138   ncols  = mat->ncols;
00139   colptr = mat->colptr;
00140   colind = mat->colind;
00141   colids = mat->colids;
00142 
00143   rmarker = params->rmarker;
00144   cand    = params->cand;
00145 
00146 
00147   
00148   pmat = gk_csr_Create();
00149   pmat->nrows  = pnrows = (cid == -1 ? nrows : colptr[cid+1]-colptr[cid]);
00150 
00151 
00152   
00153   if (cid == -1) { 
00154     gk_iset(nrows, 1, rmarker);
00155   }
00156   else { 
00157     for (i=colptr[cid]; i<colptr[cid+1]; i++) 
00158       rmarker[colind[i]] = 1;
00159   }
00160 
00161 
00162   
00163   for (pncols=0, pnnz=0, i=cid+1; i<ncols; i++) {
00164     for (k=0, j=colptr[i]; j<colptr[i+1]; j++) {
00165       k += rmarker[colind[j]];
00166     }
00167     if (k >= params->minfreq && k <= params->maxfreq) {
00168       cand[pncols].val   = i;
00169       cand[pncols++].key = k;
00170       pnnz += k;
00171     }
00172   }
00173 
00174   
00175   gk_ikvsorti(pncols, cand);
00176 
00177 
00178   
00179   pmat->ncols  = pncols;
00180   pmat->colids = pcolids = gk_imalloc(pncols, "itemsets_project_matrix: pcolids");
00181   pmat->colptr = pcolptr = gk_zmalloc(pncols+1, "itemsets_project_matrix: pcolptr");
00182   pmat->colind = pcolind = gk_imalloc(pnnz, "itemsets_project_matrix: pcolind");
00183 
00184 
00185   
00186   pcolptr[0] = 0;
00187   for (pnnz=0, ii=0; ii<pncols; ii++) {
00188     i = cand[ii].val;
00189     for (j=colptr[i]; j<colptr[i+1]; j++) {
00190       if (rmarker[colind[j]]) 
00191         pcolind[pnnz++] = colind[j];
00192     }
00193 
00194     pcolids[ii] = colids[i];
00195     pcolptr[ii+1] = pnnz;
00196   }
00197 
00198 
00199   
00200   if (cid == -1) { 
00201     gk_iset(nrows, 0, rmarker);
00202   }
00203   else { 
00204     for (i=colptr[cid]; i<colptr[cid+1]; i++) 
00205       rmarker[colind[i]] = 0;
00206   }
00207 
00208 
00209   return pmat;
00210 }