Speeding up Multiple Instance Learning Classification Rules on GPUs


This website contains additional material to the paper titled Speeding up Multiple Instance Learning Classification Rules on GPUs published in Knowledge and Information Systems.

Abstract

Multiple instance learning is a challenging task in supervised learning and data mining. However, algorithm performance degrades considerably when learning from large-scale and high-dimensional data sets. Graphics processing units (GPUs) are increasingly used to reduce the computation time of such algorithms. This paper presents a GPU implementation of the G3P-MI algorithm for solving multiple instance problems using classification rules. The proposed GPU model can be distributed across multiple GPUs, seeking scalability to large-scale and high-dimensional data sets. The proposal is compared to the multi-threaded CPU algorithm with SSE parallelism over a series of data sets. Experimental results show that the computation time can be significantly reduced and scalability improved. Specifically, a speedup of up to 149x is achieved over the multi-threaded CPU algorithm when using four GPUs, and the rules interpreter achieves great efficiency, running over 108 billion Genetic Programming operations per second.

Data sets

The real-world data sets were collected from the UCI repository website, but they can only be categorized as medium sized. In order to evaluate larger data, we generated 7 artificial data sets, each with 100 attributes, ranging from 1,000 up to 1 million instances (a sketch of the bag/instance layout of such data sets is shown after the table). Download data sets

Data set Attributes Bags Instances
Component 201 3,130 36,894
EastWest 25 20 213
Elephant 231 200 1,391
Fox 231 200 1,320
Function 201 5,242 55,536
Musk1 167 92 476
Musk2 167 102 6,598
Mutagenesis-atoms 11 188 1,618
Mutagenesis-bonds 17 188 3,995
Mutagenesis-chains 25 188 5,349
Process 201 11,718 118,417
Suramin 21 11 2,378
Tiger 231 200 1,220
Artificial1 100 10 1,000
Artificial2 100 100 1,000
Artificial3 100 100 10,000
Artificial4 100 1,000 10,000
Artificial5 100 1,000 100,000
Artificial6 100 10,000 100,000
Artificial7 100 100,000 1,000,000
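
The following host-side sketch illustrates the bag/instance layout of such a data set: each bag groups several instances and carries a single binary class label, and attribute values are stored attribute-major so that consecutive instances occupy consecutive memory positions, which matches the indexing read by the coverage kernel below (instancesData[instance + numberInstances * attribute]). The structure and function names (MILDataSet, generateArtificialDataSet) and the random value distribution are illustrative assumptions, not the actual generator used for the artificial data sets.

 #include <cstdio>
 #include <cstdlib>
 #include <vector>
 
 // Minimal sketch of a bag-structured MIL data set (names are illustrative)
 struct MILDataSet {
   std::vector<float> instancesData;   // attribute-major: [attribute * numInstances + instance]
   std::vector<int>   bagOfInstance;   // bag index of every instance
   std::vector<int>   bagClass;        // one binary label per bag
 };
 
 MILDataSet generateArtificialDataSet(int numBags, int numInstances, int numAttributes)
 {
   MILDataSet ds;
   ds.instancesData.resize((size_t) numAttributes * numInstances);
   ds.bagOfInstance.resize(numInstances);
   ds.bagClass.resize(numBags);
 
   int instancesPerBag = numInstances / numBags;
 
   for(int i = 0; i < numInstances; i++) {
     int b = i / instancesPerBag;
     ds.bagOfInstance[i] = (b < numBags) ? b : numBags - 1;   // last bag absorbs the remainder
     for(int a = 0; a < numAttributes; a++)
       ds.instancesData[(size_t) a * numInstances + i] = rand() / (float) RAND_MAX;
   }
 
   for(int b = 0; b < numBags; b++)
     ds.bagClass[b] = rand() % 2;   // random binary bag label
 
   return ds;
 }
 
 int main()
 {
   // Same shape as Artificial4: 1,000 bags, 10,000 instances, 100 attributes
   MILDataSet ds = generateArtificialDataSet(1000, 10000, 100);
   printf("Generated %zu attribute values\n", ds.instancesData.size());
   return 0;
 }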

GPU kernels

Coverage kernel

 __device__ unsigned char covers(float* rule, int instance);   // forward declaration
 __global__ void coverageKernel(float* rules, unsigned char* coverarray, int numberInstances) 
 {
   // One rule per blockIdx.x, one instance per thread of the y dimension
   int instance = blockDim.y * blockIdx.y + threadIdx.y;
   if (instance < numberInstances)   // skip threads beyond the last instance
     coverarray[blockIdx.x * numberInstances + instance] = covers(&rules[blockIdx.x], instance);
 }
 
 __device__ unsigned char covers(float* rule, int instance)
 {
   // Stack-based interpreter: walks the rule token array and evaluates it for one instance
   ...
   for(int ptr = 0; ptr < ruleLength; ) 
   {
     switch(rule[ptr]) {
     ...
     case GREATER:
       // rule[ptr+1] holds the attribute index, rule[ptr+2] the constant to compare against
       attribute = rule[ptr+1];
       op1 = instancesData[instance + numberInstances * attribute];
       op2 = rule[ptr+2];
       if (op1 > op2)       push(1, stack);
       else                 push(0, stack);
       ptr += 3;
       break;
     ...
     case AND:
       op1 = pop(stack);
       op2 = pop(stack);
       if (op1 * op2 == 1)  push(1, stack);
       else                 push(0, stack);
       ptr++;
       break;
     ...
     }
   }
 
   return (unsigned char) pop(stack);
 }
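
In this kernel, blockIdx.x selects the rule and the y dimension of the grid enumerates instances, so each thread writes one cell of the rule-by-instance coverage array. The rule itself is a flat array of tokens evaluated by the stack-based covers interpreter: GREATER reads an attribute index and a constant and pushes the comparison result, while AND pops two results and pushes their conjunction, so an expression such as (attr3 > 0.5) AND (attr7 > 0.2) would be encoded as the token sequence GREATER 3 0.5 GREATER 7 0.2 AND. The host-side launch below is only an illustrative sketch: the 128-thread block size and the names (launchCoverage, d_rules, d_coverArray, numberRules) are assumptions, not the configuration reported in the paper.

 // Illustrative launch of the coverage kernel (block size and names are assumptions)
 void launchCoverage(float* d_rules, unsigned char* d_coverArray,
                     int numberRules, int numberInstances)
 {
   dim3 block(1, 128);                                       // 128 instances per block
   dim3 grid(numberRules, (numberInstances + 127) / 128);    // one x-slot per rule
 
   coverageKernel<<<grid, block>>>(d_rules, d_coverArray, numberInstances);
   cudaDeviceSynchronize();
 }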

Hypothesis kernel

 __global__ void presenceHypothesis(unsigned char* coverarray, int* bagPrediction, int numInstances,
                                    int numBags, int* bag) {                            
   // One thread per instance: a single covered instance marks its bag as positive
   int instance = blockDim.y * blockIdx.y + threadIdx.y;
   
   if(instance < numInstances && coverarray[blockIdx.x * numInstances + instance] == 1)
      bagPrediction[blockIdx.x * numBags + bag[instance]] = 1;
 }
 
 __global__ void thresholdHypothesis(unsigned char* coverarray, int* bagPrediction, int numInstances,
                                     int numBags, int minimumCount, int* firstInstanceBag,
                                     int* lastInstanceBag) {
   // One thread per bag: the bag is positive if at least minimumCount of its instances are covered
   int bag = blockDim.y * blockIdx.y + threadIdx.y;
   if(bag >= numBags) return;   // skip threads beyond the last bag
 
   int begin = firstInstanceBag[bag], end = lastInstanceBag[bag];
   int coverCount = 0;
       
   for(int i = begin; i < end; i++)
     if(coverarray[blockIdx.x * numInstances + i] == 1)
        coverCount++;
       
   if(coverCount >= minimumCount)
      bagPrediction[blockIdx.x * numBags + bag] = 1;
 }
 
 __global__ void countHypothesis(unsigned char* coverarray, int* bagPrediction, int numInstances,
                                 int numBags, int minimumCount, int maximumCount,
                                 int* firstInstanceBag, int* lastInstanceBag) {
   // One thread per bag: the bag is positive if its covered count lies in [minimumCount, maximumCount]
   int bag = blockDim.y * blockIdx.y + threadIdx.y;
   if(bag >= numBags) return;   // skip threads beyond the last bag
 
   int begin = firstInstanceBag[bag], end = lastInstanceBag[bag];
   int coverCount = 0;
       
   for(int i = begin; i < end; i++)
     if(coverarray[blockIdx.x * numInstances + i] == 1)
        coverCount++;
       
   if(coverCount >= minimumCount && coverCount <= maximumCount)
      bagPrediction[blockIdx.x * numBags + bag] = 1;
 }
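
The three kernels above implement the bag-level multiple instance hypotheses on top of the per-instance coverage array: presenceHypothesis marks a bag as positive if any of its instances is covered, thresholdHypothesis requires at least minimumCount covered instances, and countHypothesis requires the covered count to fall within [minimumCount, maximumCount]. The sketch below shows how the host might select and launch one of them; the Hypothesis enumeration, the block size, and the device pointer names are illustrative assumptions, and d_bagPrediction is assumed to be zeroed beforehand (e.g. with cudaMemset) since the kernels only ever set entries to 1.

 // Illustrative host-side selection of the MIL hypothesis (names and sizes are assumptions)
 enum Hypothesis { PRESENCE, THRESHOLD, COUNT };
 
 void launchHypothesis(Hypothesis h, unsigned char* d_coverArray, int* d_bagPrediction,
                       int numberRules, int numInstances, int numBags,
                       int minimumCount, int maximumCount,
                       int* d_bag, int* d_firstInstanceBag, int* d_lastInstanceBag)
 {
   dim3 block(1, 128);
 
   if(h == PRESENCE) {
     // One thread per instance
     dim3 grid(numberRules, (numInstances + 127) / 128);
     presenceHypothesis<<<grid, block>>>(d_coverArray, d_bagPrediction,
                                         numInstances, numBags, d_bag);
   } else {
     // One thread per bag
     dim3 grid(numberRules, (numBags + 127) / 128);
     if(h == THRESHOLD)
       thresholdHypothesis<<<grid, block>>>(d_coverArray, d_bagPrediction, numInstances,
                                            numBags, minimumCount,
                                            d_firstInstanceBag, d_lastInstanceBag);
     else
       countHypothesis<<<grid, block>>>(d_coverArray, d_bagPrediction, numInstances,
                                        numBags, minimumCount, maximumCount,
                                        d_firstInstanceBag, d_lastInstanceBag);
   }
 }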

Fitness kernel

 __global__ void fitnessKernel(float* fitness, int* bagPrediction, int* bagClass, int numBags) {
   // Confusion-matrix counters: 4 per thread, assuming 128 threads per block in the y dimension
   __shared__ int MC[512];
   int base = blockIdx.x * numBags + threadIdx.y;
   int top =  blockIdx.x * numBags + numBags - base;   // equals numBags - threadIdx.y
      
   // Each thread zeroes the four counters it will increment (TP, TN, FP, FN)
   MC[threadIdx.y*4] = MC[threadIdx.y*4 + 1] = MC[threadIdx.y*4 + 2] = MC[threadIdx.y*4 + 3] = 0;
         
   // Performs the reduction of the confusion matrix values
   for(int i = 0; i < top; i+=128) {
     if(bagClass[threadIdx.y + i] == 1 && bagPrediction[base + i] == 1)
       MC[threadIdx.y*4]++;       // True positive
     else if(bagClass[threadIdx.y + i] == 0 && bagPrediction[base + i] == 1)
       MC[threadIdx.y*4 + 2]++;   // False positive
     else if(bagClass[threadIdx.y + i] == 1 && bagPrediction[base + i] == 0)
       MC[threadIdx.y*4 + 3]++;   // False negative
     else
       MC[threadIdx.y*4 + 1]++;   // True negative
   }
       
   __syncthreads();
       
   // Threads 0-3 each accumulate one counter (TP, TN, FP, FN) across the 128 threads
   if(threadIdx.y < 4)
     for(int i = 4; i < 512; i+=4)
       MC[threadIdx.y] += MC[threadIdx.y + i];
 
   __syncthreads();
 
   if(threadIdx.y == 0) { // Perform final fitness computation
     float sensitivity, specificity;
     int tp = MC[0], tn = MC[1], fp = MC[2], fn = MC[3];
        
     sensitivity = tp / (float) (tp + fn);
     specificity = tn / (float) (tn + fp);
 
     // Fitness function
     fitness[blockIdx.x] = sensitivity * specificity;
   }
 }
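
Putting the three kernels together, the evaluation of a population of rules can be sketched as a simple pipeline: per-instance coverage, bag-level prediction under one of the hypotheses, and one fitness value (sensitivity x specificity) per rule. The sketch below is only illustrative and assumes the presence hypothesis, 128-thread blocks, and the same hypothetical pointer names used above; the fitness kernel is launched with 128 threads per block in the y dimension, which is what its 512-entry shared counter array (4 counters per thread) expects.

 // Illustrative evaluation pipeline for a population of rules (names and sizes are assumptions)
 void evaluatePopulation(float* d_rules, unsigned char* d_coverArray, int* d_bagPrediction,
                         float* d_fitness, int* d_bag, int* d_bagClass,
                         int numberRules, int numberInstances, int numBags)
 {
   dim3 block(1, 128);
 
   // 1. Per-instance coverage of every rule
   dim3 coverGrid(numberRules, (numberInstances + 127) / 128);
   coverageKernel<<<coverGrid, block>>>(d_rules, d_coverArray, numberInstances);
 
   // 2. Bag-level prediction under the presence hypothesis (predictions reset first)
   cudaMemset(d_bagPrediction, 0, (size_t) numberRules * numBags * sizeof(int));
   presenceHypothesis<<<coverGrid, block>>>(d_coverArray, d_bagPrediction,
                                            numberInstances, numBags, d_bag);
 
   // 3. One fitness value (sensitivity * specificity) per rule
   dim3 fitGrid(numberRules, 1);
   fitnessKernel<<<fitGrid, block>>>(d_fitness, d_bagPrediction, d_bagClass, numBags);
 
   cudaDeviceSynchronize();
 }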