C45Tree.cc
Go to the documentation of this file.
00001 
00008 #include "C45Tree.hh"
00009 
00010 
00011 
00012 C45Tree::C45Tree(int id, int trainMode, int trainFreq, int m,
00013                  float featPct, Random rng):
00014   id(id), mode(trainMode), freq(trainFreq), M(m),
00015   featPct(featPct), ALLOW_ONLY_SPLITS(true), rng(rng)
00016 {
00017 
00018   nnodes = 0;
00019   nOutput = 0;
00020   nExperiences = 0;
00021   hadError = false;
00022   maxnodes = N_C45_NODES;
00023   totalnodes = 0;
00024 
00025   // how close a split has to be to be randomly selected
00026   SPLIT_MARGIN = 0.0; //0.02; //5; //01; //0.05; //0.2; //0.05;
00027 
00028   MIN_GAIN_RATIO = 0.0001; //0.0004; //0.001; //0.0002; //0.001;
00029 
00030   DTDEBUG = false; //true;
00031   SPLITDEBUG = false; //true;
00032   STOCH_DEBUG = false; //true; //false; //true;
00033   INCDEBUG = false; //true; //false; //true;
00034   NODEDEBUG = false;
00035   COPYDEBUG = false;
00036 
00037   cout << "Created C4.5 decision tree " << id;
00038   if (ALLOW_ONLY_SPLITS) cout << " with == and > splits" << endl;
00039   else cout << " with > splits only" << endl;
00040   if (DTDEBUG) {
00041     cout << " mode: " << mode << " freq: " << freq << endl;
00042   }
00043 
00044   initNodes();
00045   initTree();
00046 
00047 }
00048 
00049 C45Tree::C45Tree(const C45Tree &t):
00050   id(t.id), mode(t.mode), freq(t.freq), M(t.M),
00051   featPct(t.featPct), ALLOW_ONLY_SPLITS(t.ALLOW_ONLY_SPLITS), rng(t.rng)
00052 {
00053   COPYDEBUG = t.COPYDEBUG;
00054   if (COPYDEBUG) cout << "  C4.5 tree copy constructor id " << id << endl;
00055   nnodes = 0;
00056   nOutput = t.nOutput;
00057   nExperiences = t.nExperiences;
00058   hadError = t.hadError;
00059   maxnodes = t.maxnodes;
00060   totalnodes = 0;
00061 
00062   SPLIT_MARGIN = t.SPLIT_MARGIN;
00063   MIN_GAIN_RATIO = t.MIN_GAIN_RATIO;
00064   DTDEBUG = t.DTDEBUG;
00065   SPLITDEBUG = t.SPLITDEBUG;
00066   STOCH_DEBUG = t.STOCH_DEBUG;
00067   INCDEBUG = t.INCDEBUG;
00068   NODEDEBUG = t.NODEDEBUG;
00069   
00070   
00071   if (COPYDEBUG) cout << "   C4.5 copy nodes, experiences, root, etc" << endl;
00072   // copy all experiences
00073   for (int i = 0; i < N_C45_EXP; i++){
00074     allExp[i] = t.allExp[i];
00075   }
00076   if (COPYDEBUG) cout << "   C4.5 copied exp array" << endl;
00077 
00078   // set experience pointers
00079   experiences.resize(t.experiences.size());
00080   for (unsigned i = 0; i < t.experiences.size(); i++){
00081     experiences[i] = &(allExp[i]);
00082   }
00083   if (COPYDEBUG) cout << "   C4.5 set experience pointers" << endl;
00084 
00085   // now the tricky part, set the pointers inside the tree nodes correctly
00086   initNodes();
00087 
00088   if (COPYDEBUG) cout << "   C4.5 copy tree " << endl;
00089   root = allocateNode();
00090   lastNode = root;
00091   copyTree(root, t.root);
00092   if (COPYDEBUG) cout << "   C4.5 tree copy done" << endl;
00093    
00094   if (COPYDEBUG) {
00095     cout << endl << "New tree: " << endl;
00096     printTree(root, 0);
00097     cout << endl;
00098     cout << "  c4.5 copy done" << endl;
00099   }
00100 
00101 }
00102 
00103 C45Tree* C45Tree::getCopy(){
00104   
00105   C45Tree* copy = new C45Tree(*this);
00106   return copy;
00107 
00108 }
00109 
00110 void C45Tree::copyTree(tree_node* newNode, tree_node* origNode){
00111 
00112   int nodeId = newNode->id;
00113 
00114   if (COPYDEBUG) {
00115     cout << "    Copy node " << newNode->id << " from node " << origNode->id << endl;
00116     cout << "    NewAddy " << newNode << ", old: " << origNode << endl;
00117   }
00118 
00119   // copy node from t
00120   *newNode = *origNode;
00121   newNode->id = nodeId;
00122 
00123   // if it has children, allocate and copy them too
00124   if (origNode->l != NULL && !newNode->leaf){
00125     newNode->l = allocateNode();
00126     if (COPYDEBUG) cout << "     Copy left node " << newNode->l->id << " from " << origNode->l->id << endl;
00127     copyTree(newNode->l, origNode->l);
00128   } else {
00129     newNode->l = NULL;
00130   }
00131 
00132   if (origNode->r != NULL && !newNode->leaf){
00133     newNode->r = allocateNode();
00134     if (COPYDEBUG) cout << "     Copy right node " << newNode->r->id << " from " << origNode->r->id << endl;
00135     copyTree(newNode->r, origNode->r);
00136   } else {
00137     newNode->r = NULL;
00138   }
00139 }
00140 
00141 
00142 C45Tree::~C45Tree() {
00143   deleteTree(root);
00144   for (unsigned i = N_C45_EXP; i < experiences.size(); i++){
00145     delete experiences[i];
00146   }
00147   experiences.clear();
00148 }
00149 
00150 // here the target output will be a single value
00151 bool C45Tree::trainInstance(classPair &instance){
00152 
00153   if (DTDEBUG) cout << "trainInstance" << endl;
00154 
00155   bool modelChanged = false;
00156 
00157   // simply add this instance to the set of experiences
00158 
00159   // take from static array until we run out
00160   tree_experience *e;
00161   if (nExperiences < N_C45_EXP){
00162     // from statically created set of experiences
00163     e = &(allExp[nExperiences]);
00164 
00165   } else {
00166     // dynamically create experience
00167     e = new tree_experience;
00168   }
00169 
00170 
00171   e->input = instance.in;
00172   e->output = instance.out;
00173   experiences.push_back(e);
00174   nExperiences++;
00175 
00176   if (nExperiences == 1000000){
00177     cout << "Reached limit of # experiences allowed." << endl;
00178     return false;
00179   }
00180 
00181   if (nExperiences != (int)experiences.size())
00182     cout << "ERROR: experience size mismatch: " << nExperiences << ", " << experiences.size() << endl;
00183 
00184   //cout << nExperiences << endl << flush;
00185   //if (nExperiences == 503 && id == 10){
00186   //  DTDEBUG = true;
00187   //  SPLITDEBUG = true;
00188   //  INCDEBUG = true;
00189   //}
00190 
00191   if (DTDEBUG) {
00192     cout << "Original input: ";
00193     for (unsigned i = 0; i < instance.in.size(); i++){
00194       cout << instance.in[i] << ", ";
00195     }
00196     cout << endl << " Original output: " << instance.out << endl;
00197     cout << "Added exp id: " << nExperiences << " output: " << e->output << endl;
00198     cout << "Address: " << e << " Input : ";
00199     for (unsigned i = 0; i < e->input.size(); i++){
00200       cout << e->input[i] << ", ";
00201     }
00202     cout << endl << " Now have " << nExperiences << " experiences." << endl;
00203   }
00204 
00205   // depending on mode/etc, maybe re-build tree
00206 
00207   // mode 0: re-build every step
00208   if (mode == BUILD_EVERY || nExperiences <= 1){
00209     modelChanged = rebuildTree();
00210   }
00211 
00212   // mode 1: re-build on error only
00213   else if (mode == BUILD_ON_ERROR){
00214 
00215     // build on misclassification
00216     // check for misclassify
00217     // get leaf
00218     tree_node* leaf = traverseTree(root, e->input);
00219     // find probability for this output
00220     float count = (float)leaf->outputs[e->output];
00221     float outputProb = count / (float)leaf->nInstances;
00222 
00223     if (outputProb < 0.75){
00224       modelChanged = rebuildTree();
00225       modelChanged = true;
00226     }
00227   }
00228 
00229   // mode 2: re-build every FREQ steps
00230   else if (mode == BUILD_EVERY_N){
00231     // build every freq steps
00232     if (!modelChanged && (nExperiences % freq) == 0){
00233       modelChanged = rebuildTree();
00234     }
00235   }
00236 
00237   if (modelChanged){
00238     if (DTDEBUG) cout << "DT " << id << " tree re-built." << endl;
00239 
00240     if (DTDEBUG){
00241       cout << endl << "DT: " << id << endl;
00242       printTree(root, 0);
00243       cout << "Done printing tree" << endl;
00244     }
00245   }
00246 
00247   /*
00248   if (nExperiences % 50 == 0){
00249     cout << endl << "DT: " << id << endl;
00250     printTree(root, 0);
00251     cout << "Done printing tree" << endl;
00252   }
00253   */
00254 
00255   return modelChanged;
00256 
00257 }
00258 
00259 
00260 // here the target output will be a single value
00261 bool C45Tree::trainInstances(std::vector<classPair> &instances){
00262   if (DTDEBUG) cout << "DT " << id << "  trainInstances: " << instances.size() << endl;
00263 
00264   bool modelChanged = false;
00265 
00266   bool doBuild = false;
00267 
00268   bool buildOnError = false;
00269 
00270   // loop through instances, possibly checking for errors
00271   for (unsigned a = 0; a < instances.size(); a++){
00272     classPair instance = instances[a];
00273 
00274     // simply add this instance to the set of experiences
00275 
00276     // take from static array until we run out
00277     tree_experience *e;
00278     if (nExperiences < N_C45_EXP){
00279       // from statically created set of experiences
00280       e = &(allExp[nExperiences]);
00281 
00282     } else {
00283       // dynamically create experience
00284       e = new tree_experience;
00285     }
00286 
00287 
00288     e->input = instance.in;
00289     e->output = instance.out;
00290     experiences.push_back(e);
00291     nExperiences++;
00292 
00293     if (nExperiences == 1000000){
00294       cout << "Reached limit of # experiences allowed." << endl;
00295       return false;
00296     }
00297 
00298     if (nExperiences != (int)experiences.size())
00299       cout << "ERROR: experience size mismatch: " << nExperiences << ", " << experiences.size() << endl;
00300 
00301     if (DTDEBUG) {
00302       cout << "Original input: ";
00303       for (unsigned i = 0; i < instance.in.size(); i++){
00304         cout << instance.in[i] << ", ";
00305       }
00306       cout << endl << " Original output: " << instance.out << endl;
00307       cout << "Added exp id: " << nExperiences << " output: " << e->output << endl;
00308       cout << "Address: " << e << " Input : ";
00309       for (unsigned i = 0; i < e->input.size(); i++){
00310         cout << e->input[i] << ", ";
00311       }
00312       cout << endl << " Now have " << nExperiences << " experiences." << endl;
00313     }
00314 
00315     // depending on mode/etc, maybe re-build tree
00316 
00317     // don't need to check if we've already decided
00318     if (doBuild) continue;
00319 
00320     // mode 0: re-build every step
00321     if (mode == BUILD_EVERY || nExperiences <= 1){
00322       doBuild = true;
00323     }
00324 
00325     // mode 1: re-build on error only
00326     else if (mode == BUILD_ON_ERROR){
00327 
00328       // build on misclassification
00329       // check for misclassify
00330       // get leaf
00331       tree_node* leaf = traverseTree(root, e->input);
00332       // find probability for this output
00333       float count = (float)leaf->outputs[e->output];
00334       float outputProb = count / (float)leaf->nInstances;
00335 
00336       if (outputProb < 0.75){
00337         doBuild = true;
00338         buildOnError = true;
00339       }
00340     }
00341 
00342     // mode 2: re-build every FREQ steps
00343     else if (mode == BUILD_EVERY_N){
00344       // build every freq steps
00345       if (!modelChanged && (nExperiences % freq) == 0){
00346         doBuild = true;
00347       }
00348     }
00349 
00350   } // loop of new instances
00351 
00352   if (DTDEBUG) cout << "Added " << instances.size() << " new instances. doBuild = " << doBuild << endl;
00353 
00354   if (doBuild){
00355     modelChanged = rebuildTree();
00356   }
00357 
00358   if (modelChanged){
00359     if (DTDEBUG) cout << "DT " << id << " tree re-built." << endl;
00360 
00361     if (DTDEBUG){
00362       cout << endl << "DT: " << id << endl;
00363       printTree(root, 0);
00364       cout << "Done printing tree" << endl;
00365     }
00366   }
00367 
00368   return (modelChanged || buildOnError);
00369 
00370 }
00371 
00372 
00373 bool C45Tree::rebuildTree(){
00374   return buildTree(root, experiences, false);
00375 }
00376 
00377 
00378 // TODO: here we want to return the probability of the output value being each of the possible values, in the stochastic case
00379 void C45Tree::testInstance(const std::vector<float> &input, std::map<float, float>* retval){
00380   if (DTDEBUG) cout << "testInstance" << endl;
00381 
00382   retval->clear();
00383 
00384   // in case the tree is empty
00385   if (experiences.size() == 0){
00386     (*retval)[0.0] = 1.0;
00387     return;
00388   }
00389 
00390   // follow through tree to leaf
00391   tree_node* leaf = traverseTree(root, input);
00392   lastNode = leaf;
00393 
00394   // and return mapping of outputs and their probabilities
00395   outputProbabilities(leaf, retval);
00396 
00397 }
00398 
00399 float C45Tree::getConf(const std::vector<float> &input){
00400   if (DTDEBUG) cout << "numVisits" << endl;
00401 
00402   // in case the tree is empty
00403   if (experiences.size() == 0){
00404     return 0;
00405   }
00406 
00407   // follow through tree to leaf
00408   //tree_node* leaf = traverseTree(root, input);
00409 
00410   // and return # in this leaf
00411   float conf = (float)lastNode->nInstances / (float)(2.0*M);
00412   if (conf > 1.0)
00413     return 1.0;
00414   else
00415     return conf;
00416 
00417 }
00418 
00419 // check to see if this state is one we should explore
00420 // to get more info on potential splits
00421 
00422 
00423 // init the tree
00424 void C45Tree::initTree(){
00425   if (DTDEBUG) cout << "initTree()" << endl;
00426   root = allocateNode();
00427 
00428   if (DTDEBUG) cout << "   root id = " << root->id << endl;
00429 
00430   // just to ensure the diff models are on different random values
00431   for (int i = 0; i < id; i++){
00432     rng.uniform(0, 1);
00433   }
00434 
00435 }
00436 
00437 
00438 
00439 // init a tree node
00440 void C45Tree::initTreeNode(tree_node* node){
00441   if (DTDEBUG) cout << "initTreeNode()";
00442 
00443   node->id = nnodes++;
00444   if (DTDEBUG) cout << " id = " << node->id << endl;
00445 
00446   totalnodes++;
00447   if (totalnodes > maxnodes){
00448     maxnodes = totalnodes;
00449     if (DTDEBUG) cout << id << " C4.5 MAX nodes: " << maxnodes << endl;
00450   }
00451 
00452   // split criterion
00453   node->dim = -1;
00454   node->val = -1;
00455   node->type = -1;
00456 
00457   // current data
00458   node->nInstances = 0;
00459   node->outputs.clear();
00460 
00461   // next nodes in tree
00462   node->l = NULL;
00463   node->r = NULL;
00464 
00465   node->leaf = true;
00466 
00467 }
00468 
00469 void C45Tree::deleteTree(tree_node* node){
00470   if (DTDEBUG) cout << "deleteTree, node=" << node->id << endl;
00471 
00472   if (node==NULL)
00473     return;
00474 
00475   totalnodes--;
00476 
00477   node->nInstances = 0;
00478   node->outputs.clear();
00479 
00480   //recursively call deleteTree on children
00481   // then delete them
00482   if (!node->leaf){
00483     // left child
00484     if (node->l != NULL){
00485       deleteTree(node->l);
00486       deallocateNode(node->l);
00487       node->l = NULL;
00488     }
00489 
00490     // right child
00491     if (node->r != NULL){
00492       deleteTree(node->r);
00493       deallocateNode(node->r);
00494       node->r = NULL;
00495     }
00496   }
00497 
00498   node->leaf = true;
00499   node->dim = -1;
00500 
00501 }
00502 
00503 
00504 C45Tree::tree_node* C45Tree::getCorrectChild(tree_node* node,
00505                                              const std::vector<float> &input){
00506 
00507   if (DTDEBUG) cout << "getCorrectChild, node=" << node->id << endl;
00508 
00509   if (passTest(node->dim, node->val, node->type, input))
00510     return node->l;
00511   else
00512     return node->r;
00513 
00514 }
00515 
00516 C45Tree::tree_node* C45Tree::traverseTree(tree_node* node,
00517                                           const std::vector<float> &input){
00518 
00519   if (DTDEBUG) cout << "traverseTree, node=" << node->id << endl;
00520 
00521   while (!node->leaf){
00522     node = getCorrectChild(node, input);
00523   }
00524 
00525   return node;
00526 }
00527 
00528 
00529 bool C45Tree::passTest(int dim, float val, bool type, const std::vector<float> &input){
00530   if (DTDEBUG) cout << "passTest, dim=" << dim << ",val=" << val << ",type=" << type
00531                     << ",input["<<dim<<"]=" << input[dim] <<endl;
00532 
00533   if (type == CUT){
00534     if (input[dim] > val)
00535       return false;
00536     else
00537       return true;
00538   } else if (type == ONLY){
00539     if (input[dim] == val)
00540       return false;
00541     else
00542       return true;
00543   } else {
00544     return false;
00545   }
00546 
00547 }
00548 
00549 
00550 bool C45Tree::buildTree(tree_node *node,
00551                         const std::vector<tree_experience*> &instances,
00552                         bool changed){
00553   if(DTDEBUG) cout << "buildTree, node=" << node->id
00554                    << ",nInstances:" << instances.size() 
00555                    << ",chg:" << changed << endl;
00556 
00557   if (instances.size() == 0){
00558     cout << "Error: buildTree called on tree " << id << " node " << node->id << " with no instances." << endl;
00559     exit(-1);
00560   }
00561 
00562 
00563   // TODO: what about stochastic data?
00564   //std::vector<float> chiSquare = calcChiSquare(instances);
00565 
00566   // first, add instances to tree
00567   node->nInstances = instances.size();
00568 
00569   // add each output to this node
00570   node->outputs.clear();
00571   for (unsigned i = 0; i < instances.size(); i++){
00572     node->outputs[instances[i]->output]++;
00573   }
00574 
00575   // see if they're all the same
00576   if (node->outputs.size() == 1){
00577     bool change = makeLeaf(node);
00578     if (DTDEBUG){
00579       cout << "All " << node->nInstances
00580            << " classified with output "
00581            << instances[0]->output << endl;
00582     }
00583     return change;
00584   }
00585 
00586   // if not, calculate gain ratio to determine best split
00587   else {
00588 
00589     if (SPLITDEBUG) cout << endl << "Creating new decision node" << endl;
00590 
00591     //node->leaf = false;
00592     //node->nInstances++;
00593 
00594     float bestGainRatio = -1.0;
00595     int bestDim = -1;
00596     float bestVal = -1;
00597     bool bestType = 0;
00598     std::vector<tree_experience*> bestLeft;
00599     std::vector<tree_experience*> bestRight;
00600 
00601     testPossibleSplits(instances, &bestGainRatio, &bestDim, &bestVal, &bestType, &bestLeft, &bestRight);
00602 
00603     return implementSplit(node, bestGainRatio, bestDim, bestVal, bestType, bestLeft, bestRight, changed);
00604 
00605   }
00606 
00607 }
00608 
00609 
00610 bool C45Tree::makeLeaf(tree_node* node){
00611 
00612   // check on children
00613   if (node->l != NULL){
00614     deleteTree(node->l);
00615     deallocateNode(node->l);
00616     node->l = NULL;
00617   }
00618 
00619   if (node->r != NULL){
00620     deleteTree(node->r);
00621     deallocateNode(node->r);
00622     node->r = NULL;
00623   }
00624 
00625   // changed from not leaf to leaf, or just init'd
00626   bool change = (!node->leaf || node->type < 0);
00627 
00628   node->leaf = true;
00629   node->type = 0;
00630 
00631   return change;
00632 }
00633 
00634 bool C45Tree::implementSplit(tree_node* node, float bestGainRatio, int bestDim,
00635                              float bestVal, bool bestType,
00636                              const std::vector<tree_experience*> &bestLeft, 
00637                              const std::vector<tree_experience*> &bestRight,
00638                              bool changed){
00639   if (DTDEBUG) cout << "implementSplit node=" << node->id << ",gainRatio=" << bestGainRatio
00640                     << ",dim=" << bestDim
00641                     << ",val=" << bestVal << ",type=" << bestType 
00642                     << ",chg=" << changed << endl;
00643 
00644 
00645   // see if this should still be a leaf node
00646   if (bestGainRatio < MIN_GAIN_RATIO){
00647     bool change = makeLeaf(node);
00648     if (SPLITDEBUG || STOCH_DEBUG){
00649       cout << "DT " << id << " Node " << node->id << " Poor gain ratio: "
00650            << bestGainRatio << ", " << node->nInstances
00651            << " instances classified at leaf " << node->id
00652            << " with multiple outputs " << endl;
00653     }
00654     return change;
00655   }
00656 
00657   // see if this split changed or not
00658   // assuming no changes above
00659   if (!changed && node->dim == bestDim && node->val == bestVal
00660       && node->type == bestType && !node->leaf
00661       && node->l != NULL && node->r != NULL){
00662     // same split as before.
00663     if (DTDEBUG || SPLITDEBUG) cout << "Same split as before" << endl;
00664     bool changeL = false;
00665     bool changeR = false;
00666 
00667     // see which leaf changed
00668     if (bestLeft.size() > (unsigned)node->l->nInstances){
00669       // redo left side
00670       if (DTDEBUG) cout << "Rebuild left side of tree" << endl;
00671       changeL = buildTree(node->l, bestLeft, changed);
00672     }
00673 
00674     if (bestRight.size() > (unsigned)node->r->nInstances){
00675       // redo right side
00676       if (DTDEBUG) cout << "Rebuild right side of tree" << endl;
00677       changeR = buildTree(node->r, bestRight, changed);
00678     }
00679 
00680     // everything up to here is the same, check if there were changes below
00681     return (changeL || changeR);
00682   }
00683 
00684   // totally new
00685   // set the best split here
00686   node->leaf = false;
00687   node->dim = bestDim;
00688   node->val = bestVal;
00689   node->type = bestType;
00690 
00691   if (SPLITDEBUG) cout << "Best split was type " << node->type
00692                        << " with val " << node->val
00693                        << " on dim " << node->dim
00694                        << " with gainratio: " << bestGainRatio << endl;
00695 
00696   if (DTDEBUG) cout << "Left has " << bestLeft.size()
00697                     << ", right has " << bestRight.size() << endl;
00698 
00699   // make sure both instances
00700   if (bestLeft.size() == 0 || bestRight.size() == 0){
00701     cout << "ERROR: DT " << id << " node " << node->id << " has 0 instances: left: " << bestLeft.size()
00702          << " right: " << bestRight.size() << endl;
00703     cout << "Split was type " << node->type
00704          << " with val " << node->val
00705          << " on dim " << node->dim
00706          << " with gainratio: " << bestGainRatio << endl;
00707     exit(-1);
00708   }
00709 
00710 
00711   // check if these already exist
00712   if (node->l == NULL){
00713     if (DTDEBUG) cout << "Init new left tree nodes " << endl;
00714     node->l = allocateNode();
00715   }
00716   if (node->r == NULL){
00717     if (DTDEBUG) cout << "Init new right tree nodes " << endl;
00718     node->r = allocateNode();
00719   }
00720 
00721   // recursively build the sub-trees to this one
00722   if (DTDEBUG) cout << "Building left tree for node " << node->id << endl;
00723   buildTree(node->l, bestLeft, true);
00724   if (DTDEBUG) cout << "Building right tree for node " << node->id << endl;
00725   buildTree(node->r, bestRight, true);
00726 
00727   // this one changed, or above changed, no reason to check change of lower parts
00728   return true;
00729   
00730 }
00731 
00732 
00733 
00734 void C45Tree::testPossibleSplits(const std::vector<tree_experience*> &instances, 
00735                                  float *bestGainRatio, int *bestDim,
00736                                  float *bestVal, bool *bestType, 
00737                                  std::vector<tree_experience*> *bestLeft,
00738                                  std::vector<tree_experience*> *bestRight) {
00739   if (DTDEBUG) cout << "testPossibleSplits" << endl;
00740 
00741 
00742   // pre-calculate some stuff for these splits (namely I, P, C)
00743   float I = calcIforSet(instances);
00744   //if (DTDEBUG) cout << "I: " << I << endl;
00745 
00746   int nties = 0;
00747 
00748   // for each possible split, calc gain ratio
00749   for (unsigned idim = 0; idim < instances[0]->input.size(); idim++){
00750 
00751     //float* sorted = sortOnDim(idim, instances);
00752     float minVal, maxVal;
00753     std::set<float> uniques = getUniques(idim, instances, minVal, maxVal);
00754 
00755     for (std::set<float>::iterator j = uniques.begin(); j != uniques.end(); j++){
00756 
00757       // skip max val, not a valid cut for either
00758       if ((*j) == maxVal)
00759         continue;
00760       
00761       // if this is a random forest, we eliminate some random number of splits
00762       // here (decision is taken from the random set that are left)
00763       if (rng.uniform() < featPct)
00764         continue;
00765 
00766       std::vector<tree_experience*> left;
00767       std::vector<tree_experience*> right;
00768 
00769       // splits that are cuts
00770       float splitval = (*j);
00771       float gainRatio = calcGainRatio(idim, splitval, CUT, instances, I, left, right);
00772 
00773       if (SPLITDEBUG) cout << " CUT split val " << splitval
00774                            << " on dim: " << idim << " had gain ratio "
00775                            << gainRatio << endl;
00776 
00777       // see if this is the new best gain ratio
00778       compareSplits(gainRatio, idim, splitval, CUT, left, right, &nties,
00779                     bestGainRatio, bestDim, bestVal, bestType, bestLeft, bestRight);
00780 
00781 
00782       // no minval here, it would be the same as the cut split on minval
00783       if (ALLOW_ONLY_SPLITS && (*j) != minVal){
00784         // splits that are true only if this value is equal
00785         float splitval = (*j);
00786 
00787         float gainRatio = calcGainRatio(idim, splitval, ONLY, instances, I, left, right);
00788 
00789         if (SPLITDEBUG) cout << " ONLY split val " << splitval
00790                              << " on dim: " << idim << " had gain ratio "
00791                              << gainRatio << endl;
00792 
00793         // see if this is the new best gain ratio
00794         compareSplits(gainRatio, idim, splitval, ONLY, left, right, &nties,
00795                       bestGainRatio, bestDim, bestVal, bestType, bestLeft, bestRight);
00796 
00797       } // splits with only
00798 
00799     } // j loop
00800   }
00801 }
00802 
00803 
00804 
00805 void C45Tree::compareSplits(float gainRatio, int dim, float val, bool type,
00806                             const std::vector<tree_experience*> &left, 
00807                             const std::vector<tree_experience*> &right,
00808                             int *nties, float *bestGainRatio, int *bestDim,
00809                             float *bestVal, bool *bestType,
00810                             std::vector<tree_experience*> *bestLeft, std::vector<tree_experience*> *bestRight){
00811   if (DTDEBUG) cout << "compareSplits gainRatio=" << gainRatio << ",dim=" << dim
00812                     << ",val=" << val << ",type= " << type <<endl;
00813 
00814 
00815   bool newBest = false;
00816 
00817   // if its a virtual tie, break it randomly
00818   if (fabs(*bestGainRatio - gainRatio) < SPLIT_MARGIN){
00819     //cout << "Split tie, making random decision" << endl;
00820 
00821     (*nties)++;
00822     float randomval = rng.uniform(0,1);
00823     float newsplitprob = (1.0 / (float)*nties);
00824 
00825     if (randomval < newsplitprob){
00826       newBest = true;
00827       if (SPLITDEBUG) cout << "   Tie on split. DT: " << id << " rand: " << randomval
00828                            << " splitProb: " << newsplitprob << ", selecting new split " << endl;
00829     }
00830     else
00831       if (SPLITDEBUG) cout << "   Tie on split. DT: " << id << " rand: " << randomval
00832                            << " splitProb: " << newsplitprob << ", staying with old split " << endl;
00833   }
00834 
00835   // if its clearly better, set this as the best split
00836   else if (gainRatio > *bestGainRatio){
00837     newBest = true;
00838     *nties = 1;
00839   }
00840 
00841 
00842   // set the split features
00843   if (newBest){
00844     *bestGainRatio = gainRatio;
00845     *bestDim = dim;
00846     *bestVal = val;
00847     *bestType = type;
00848     *bestLeft = left;
00849     *bestRight = right;
00850     if (SPLITDEBUG){
00851       cout << "  New best gain ratio: " << *bestGainRatio
00852            << ": type " << *bestType
00853            << " with val " << *bestVal
00854            << " on dim " << *bestDim << endl;
00855     }
00856   } // newbest
00857 }
00858 
00859 float C45Tree::calcGainRatio(int dim, float val, bool type,
00860                              const std::vector<tree_experience*> &instances,
00861                              float I,
00862                              std::vector<tree_experience*> &left,
00863                              std::vector<tree_experience*> &right){
00864   if (DTDEBUG) cout << "calcGainRatio, dim=" << dim
00865                     << " val=" << val
00866                     << " I=" << I
00867                     << " nInstances= " << instances.size() << endl;
00868 
00869   left.clear();
00870   right.clear();
00871 
00872   // array with percentage positive and negative for this test
00873   float D[2];
00874 
00875   // info(T) = I(P): float I;
00876 
00877   // Info for this split = Info(X,T)
00878   float Info;
00879 
00880   // Gain for this split = Gain(X,T)
00881   float Gain;
00882 
00883   // SplitInfo for this split = I(|pos|/|T|, |neg|/|T|)
00884   float SplitInfo;
00885 
00886   // GainRatio for this split = GainRatio(X,T) = Gain(X,T) / SplitInfo(X,T)
00887   float GainRatio;
00888 
00889   // see where the instances would go with this split
00890   for (unsigned i = 0; i < instances.size(); i++){
00891     if (DTDEBUG) cout << "calcGainRatio - Classify instance " << i 
00892                       << " on new split " << endl;
00893 
00894     if (passTest(dim, val, type, instances[i]->input)){
00895       left.push_back(instances[i]);
00896     }
00897     else{
00898       right.push_back(instances[i]);
00899     }
00900   }
00901 
00902   if (DTDEBUG) cout << "Left has " << left.size()
00903                     << ", right has " << right.size() << endl;
00904 
00905   D[0] = (float)left.size() / (float)instances.size();
00906   D[1] = (float)right.size() / (float)instances.size();
00907   float leftInfo = calcIforSet(left);
00908   float rightInfo = calcIforSet(right);
00909   Info = D[0] * leftInfo + D[1] * rightInfo;
00910   Gain = I - Info;
00911   SplitInfo = calcIofP((float*)&D, 2);
00912   GainRatio = Gain / SplitInfo;
00913 
00914   if (DTDEBUG){
00915     cout << "LeftInfo: " << leftInfo
00916          << " RightInfo: " << rightInfo
00917          << " Info: " << Info
00918          << " Gain: " << Gain
00919          << " SplitInfo: " << SplitInfo
00920          << " GainRatio: " << GainRatio
00921          << endl;
00922   }
00923 
00924   return GainRatio;
00925 
00926 }
00927 
00928 float C45Tree::calcIofP(float* P, int size){
00929   if (DTDEBUG) cout << "calcIofP, size=" << size << endl;
00930   float I = 0;
00931   for (int i = 0; i < size; i++){
00932     I -= P[i] * log(P[i]);
00933   }
00934   return I;
00935 }
00936 
00937 float C45Tree::calcIforSet(const std::vector<tree_experience*> &instances){
00938   if (DTDEBUG) cout << "calcIforSet" << endl;
00939 
00940   std::map<float, int> classes;
00941 
00942   // go through instances and figure count of each type
00943   for (unsigned i = 0; i < instances.size(); i++){
00944     // increment count for this value
00945     float val = instances[i]->output;
00946     classes[val]++;
00947   }
00948 
00949   // now calculate P
00950   float Pval;
00951   float I = 0;
00952   for (std::map<float, int>::iterator i = classes.begin(); i != classes.end(); i++){
00953     Pval = (float)(*i).second / (float)instances.size();
00954     // calc I of P
00955     I -= Pval * log(Pval);
00956   }
00957 
00958   return I;
00959 
00960 }
00961 
00962 std::set<float> C45Tree::getUniques(int dim, const std::vector<tree_experience*> &instances, float& minVal, float& maxVal){
00963   if (DTDEBUG) cout << "getUniques,dim = " << dim;
00964 
00965   std::set<float> uniques;
00966 
00967   for (int i = 0; i < (int)instances.size(); i++){
00968     if (i == 0 || instances[i]->input[dim] < minVal)
00969       minVal = instances[i]->input[dim];
00970     if (i == 0 || instances[i]->input[dim] > maxVal)
00971       maxVal = instances[i]->input[dim];
00972 
00973     uniques.insert(instances[i]->input[dim]);
00974   }
00975 
00976   if (DTDEBUG) cout << " #: " << uniques.size() << endl;
00977   return uniques;
00978 }
00979 
00980 
00981 float* C45Tree::sortOnDim(int dim, const std::vector<tree_experience*> &instances){
00982   if (DTDEBUG) cout << "sortOnDim,dim = " << dim << endl;
00983 
00984   float* values = new float[instances.size()];
00985 
00986   for (int i = 0; i < (int)instances.size(); i++){
00987     //cout << "Instance " << i << endl;
00988 
00989     float val = instances[i]->input[dim];
00990     //cout << " val: " << val << endl;
00991 
00992     // find where this should go
00993     for (int j = 0; j <= i; j++){
00994       //cout << " j: " << j << endl;
00995 
00996       // get to i, this is the spot then
00997       if (j==i){
00998         values[j] = val;
00999         //cout << "  At i, putting value in slot j: " << j << endl;
01000       }
01001 
01002       // if this is the spot
01003       else if (val < values[j]){
01004         //cout << "  Found slot at j: " << j << endl;
01005 
01006         // slide everything forward to make room
01007         for (int k = i; k > j; k--){
01008           //cout << "   k = " << k << " Sliding value from k-1 to k" << endl;
01009           values[k] = values[k-1];
01010         }
01011 
01012         // put value in its spot at j
01013         //cout << "  Putting value at slot j: " << j << endl;
01014         values[j] = val;
01015 
01016         // break
01017         break;
01018       }
01019 
01020     }
01021   }
01022 
01023   if (DTDEBUG){
01024     cout << "Sorted array: " << values[0];
01025     for (int i = 1; i < (int)instances.size(); i++){
01026       cout << ", " << values[i];
01027     }
01028     cout << endl;
01029   }
01030 
01031   return values;
01032 
01033 }
01034 
01035 
01036 void C45Tree::printTree(tree_node *t, int level){
01037 
01038   for (int i = 0; i < level; i++){
01039     cout << ".";
01040   }
01041 
01042   cout << "Node " << t->id;
01043   if (t->type == C45Tree::CUT) cout << " Type: CUT"; 
01044   else                cout << " Type: ONLY";
01045   cout << " Dim: " << t->dim << " Val: " << t->val
01046        << " nInstances: " << t->nInstances ;
01047   
01048   if (t->leaf){
01049     cout << " Outputs: ";
01050     for (std::map<float, int>::iterator j = t->outputs.begin();
01051          j != t->outputs.end(); j++){
01052       cout << (*j).first << ": " << (*j).second << ", ";
01053     }
01054     cout << endl;
01055   }
01056   else
01057     cout << " Left: " << t->l->id << " Right: " << t->r->id << endl;
01058 
01059 
01060   // print children
01061   if (t->dim != -1 && !t->leaf){
01062     printTree(t->l, level+1);
01063     printTree(t->r, level+1);
01064   }
01065 
01066 }
01067 
01068 
01069 
01070 
01071 
01072 // output a map of outcomes and their probabilities for this leaf node
01073 void C45Tree::outputProbabilities(tree_node* leaf, std::map<float, float>* retval){
01074   if (STOCH_DEBUG) cout << "Calculating output probs for leaf " << leaf->id << endl;
01075 
01076   // go through all output values at this leaf, turn into probabilities
01077   for (std::map<float, int>::iterator it = leaf->outputs.begin();
01078        it != leaf->outputs.end(); it++){
01079 
01080     float val = (*it).first;
01081     float count = (float)(*it).second;
01082     if (count > 0)
01083       (*retval)[val] = count / (float)leaf->nInstances;
01084 
01085     if (STOCH_DEBUG) 
01086       cout << "Output value " << val << " had count of " << count << " on "
01087            << leaf->nInstances <<" instances and prob of " 
01088            << (*retval)[val] << endl;
01089   }
01090 
01091 }
01092 
01093 
01094 void C45Tree::initNodes(){
01095 
01096   for (int i = 0; i < N_C45_NODES; i++){
01097     initTreeNode(&(allNodes[i]));
01098     freeNodes.push_back(i);
01099     if (NODEDEBUG) 
01100       cout << "init node " << i << " with id " << allNodes[i].id 
01101            << ", now " << freeNodes.size() << " free nodes." << endl;
01102   }
01103 
01104 }
01105 
01106 C45Tree::tree_node* C45Tree::allocateNode(){
01107   if (freeNodes.empty()){
01108     tree_node* newNode = new tree_node;
01109     initTreeNode(newNode);
01110     if (NODEDEBUG) 
01111       cout << id << " PROBLEM: No more pre-allocated nodes!!!" << endl
01112            << "return new node " << newNode->id 
01113            << ", now " << freeNodes.size() << " free nodes." << endl;
01114     return newNode;
01115   }
01116 
01117   int i = freeNodes.back();
01118   freeNodes.pop_back();
01119   if (NODEDEBUG) 
01120     cout << id << " allocate node " << i << " with id " << allNodes[i].id 
01121          << ", now " << freeNodes.size() << " free nodes." << endl;
01122   return &(allNodes[i]);
01123 }
01124 
01125 void C45Tree::deallocateNode(tree_node* node){
01126 
01127   if (node->id >= N_C45_NODES){
01128     if (NODEDEBUG) 
01129       cout << id << " dealloc extra node id " << node->id 
01130            << ", now " << freeNodes.size() << " free nodes." << endl;
01131     delete node;
01132     return;
01133   }
01134 
01135   freeNodes.push_back(node->id);
01136   if (NODEDEBUG) 
01137     cout << id << " dealloc node " << node->id 
01138          << ", now " << freeNodes.size() << " free nodes." << endl;
01139 }
01140 


rl_agent
Author(s): Todd Hester
autogenerated on Thu Jun 6 2019 22:00:13