Marsyas  0.2
/home/gperciva/src/marsyas/src/marsyas/WekaData.cpp
00001 #include "WekaData.h"
00002 
00003 using namespace std;
00004 using namespace Marsyas;
00005 
00006 //This class represents a collection of row data read from a weka arff file.
00007 //It is organized as a vector collection of vector pointers.
00008 //It is optimized for fast sorting and shuffling of the data. It is not intended
00009 //that the data change once it is loaded.
00010 //
00011 //It is also assumed that the last column of each row is the class attribute.
00012 //All data items are mrs_real, including the class attribute, however the class
00013 //attribute should be interpreted as an mrs_natural.
00014 WekaData::WekaData():cols_(0),rows_(0), isFold_(false)
00015 {
00016 }
00017 
00018 WekaData::~WekaData()
00019 {
00020 
00021     // if it is a fold then the pointers refers 
00022     // to rows in the original data so the data 
00023     // they point to doesn't need to be deallocated 
00024     // The "original" WekaData for which the folds 
00025     // where computed takes care of it 
00026     if (!isFold_)                       
00027         Clear();
00028 }
00029 
00030 void 
00031 WekaData::setFold(bool isFold) 
00032 {
00033     isFold_ = isFold;
00034 }
00035 
00036 
00037 //create the table. Will clear contents first and fix the number of columns.
00038 void WekaData::Create(mrs_natural cols)
00039 {
00040     MRSASSERT(cols>=0);
00041     this->Clear();
00042     cols_ = cols;
00043     rows_ = 0;
00044 }
00045 
00046 //clear all data from the table
00047 //Requires that the vector rows be freed
00048 void WekaData::Clear()
00049 {
00050     if (rows_ > 0) {
00051         vector<vector<mrs_real>*>::iterator iter = this->begin();
00052         while (iter != this->end()) {
00053             delete (*iter);
00054             this->erase(iter);
00055         }
00056     }
00057     this->clear();
00058     filenames_.clear();
00059     
00060 }//Clear
00061 
00062 
00063 void 
00064 WekaData::NormMaxMinRow(realvec& in)
00065 {
00066   int ii;
00067   for(ii=0; ii<(int)in.getSize()-1; ++ii)
00068     {
00069       in(ii) =  (in(ii) - minimums_(ii)) / (maximums_(ii) - minimums_(ii));
00070     }
00071 }
00072 
00073 void 
00074 WekaData::NormMaxMin()
00075 {
00076   minimums_.create(cols_-1);
00077   maximums_.create(cols_-1);
00078   maximums_.setval(DBL_MIN);
00079   minimums_.setval(DBL_MAX);
00080   
00081   // find minimums_ and maximums_ 
00082   for(vector<vector<mrs_real>*>::const_iterator citer = this->begin(); citer!=this->end(); citer++)
00083     {
00084       const vector<mrs_real> *row = (*citer);
00085       int ii;
00086       for(ii=0; ii<(int)row->size()-1; ++ii)
00087     {
00088       if (row->at(ii) > maximums_(ii))
00089         maximums_(ii) = row->at(ii);
00090       if (row->at(ii) < minimums_(ii))
00091         minimums_(ii) = row->at(ii);
00092     }
00093     }
00094   
00095 
00096   // normalize 
00097   for(vector<vector<mrs_real>*>::const_iterator citer = this->begin(); citer!=this->end(); citer++)
00098     {
00099       vector<mrs_real> *row = (*citer);
00100       int ii;
00101       for(ii=0; ii<(int)row->size()-1; ++ii)
00102       {
00103           // don't divide by zero 
00104           if (maximums_(ii) - minimums_(ii) == 0)
00105               row->at(ii) = 0;
00106           else 
00107               row->at(ii) =  ((row->at(ii) - minimums_(ii)) / (maximums_(ii) - minimums_(ii)));
00108       }
00109     }
00110   
00111 
00112 
00113 
00114 }
00115 
00116 mrs_realvec WekaData::GetMinimums() const
00117 {
00118     return minimums_;
00119 }
00120  
00121 mrs_realvec WekaData::GetMaximums() const
00122 {
00123     return maximums_;
00124 }
00125 
00126 
00127 //randomly shuffle the data in the table
00128 //Need only to swap the pointers to row data, nice and fast!
00129 void WekaData::Shuffle()
00130 {
00131     srand(0);
00132     
00133     mrs_natural size = this->size()-1;
00134     for (mrs_natural ii=0; ii<size; ++ii)
00135     {
00136         mrs_natural rind = (mrs_natural)(((mrs_real)rand() / (mrs_real)(RAND_MAX))*size);
00137         //swap row ii with row rind
00138         swapRows(ii, rind);
00139     }//for ii
00140 }//Shuffle
00141 
00142 //SwapRows will exchange one row for another.
00143 //Just need to swap the 2 vector pointers.
00144 void WekaData::swapRows(mrs_natural l, mrs_natural r)
00145 {
00146     vector<mrs_real> *temp = this->at(l);
00147     this->at(l) = this->at(r);
00148     this->at(r) = temp;
00149 }
00150 
00151 mrs_natural WekaData::partition(mrs_natural attIndex, mrs_natural l, mrs_natural r)
00152 {
00153     mrs_real pivot = this->at((l+r)/2)->at(attIndex);
00154     while (l < r)
00155     {
00156         while ((this->at(l)->at(attIndex) < pivot) && (l < r))
00157         {
00158             l++;
00159         }//while
00160 
00161         while ((this->at(r)->at(attIndex) > pivot) && (l < r))
00162         {
00163             r--;
00164         }//while
00165 
00166         if (l < r)
00167         {
00168             swapRows(l, r);
00169             l++;
00170             r--;
00171         }//if
00172     }
00173     if ((l == r) && (this->at(r)->at(attIndex) > pivot))
00174     {
00175         r--;
00176     } //if
00177 
00178     return r;
00179 }//partition
00180 
00189   //@ requires 0 <= attIndex && attIndex < numAttributes();
00190   //@ requires 0 <= first && first <= right && right < numInstances();
00191   //Shamelessly ripped off from the weka library of code. - dale
00192 void WekaData::quickSort(mrs_natural attIndex, mrs_natural left, mrs_natural right)
00193 {
00194     if (left < right)
00195     {
00196         int middle = partition(attIndex, left, right);
00197         quickSort(attIndex, left, middle);
00198         quickSort(attIndex, middle + 1, right);
00199     }//if
00200 }//quicksort
00201 
00202 //Sort the instances dataset based on the column attr
00203 //Note that the entire table must be sorted on the attribute,
00204 //not just the attribute itself.
00205 void WekaData::Sort(mrs_natural attr)
00206 {
00207     MRSASSERT(attr>=0&&attr<cols_);
00208     quickSort(attr, 0, this->size()-1);
00209 }
00210 
00211 //add rows of data to the table
00212 void WekaData::Append(const realvec& in)
00213 {
00214     MRSASSERT(in.getRows()==cols_);
00215     // skip feature vectors labeled with negative labels
00216     
00217     if (in(in.getRows()-1, 0) >=0)
00218     {
00219         data_ = new vector<mrs_real>(cols_);
00220         for(mrs_natural ii=0; ii<in.getRows(); ++ii)
00221         {
00222             data_->at(ii) = in(ii, 0);
00223         }
00224         Append(data_);
00225     }
00226 
00227 }
00228 
00229 
00230 
00231 
00232 //add rows of data to the table
00233 void WekaData::Append(vector<mrs_real> *data)
00234 {
00235   MRSASSERT(data!=NULL && (int)data->size()==cols_);
00236   rows_++;
00237   
00238   this->push_back(data);
00239 }//Append
00240 
00241 
00242 //add rows of data to the table
00243 void WekaData::AppendFilename(mrs_string fname)
00244 {
00245   filenames_.push_back(fname);
00246 }//AppendFilename
00247 
00248 mrs_string WekaData::GetFilename(mrs_natural row) const
00249 {
00250   return (mrs_string)filenames_.at(row);
00251 }
00252 
00253 //get the class attribute for a row and convert to a int
00254 //class attribute is last column of row
00255 mrs_natural WekaData::GetClass(mrs_natural row) const
00256 {
00257     return (mrs_natural)this->at(row)->at(cols_-1);
00258 }
00259 
00260 //debug helper funtion to dump table to an ascii file
00261 void WekaData::Dump(const mrs_string& filename, const vector<mrs_string>& classNames) const
00262 {
00263     char buffer[32];
00264 
00265     ofstream *mis = new ofstream;
00266 
00267     mis->open(filename.c_str(), ios_base::out | ios_base::trunc );
00268     MRSASSERT( mis->is_open() );
00269 
00270     for(vector<vector<mrs_real>*>::const_iterator citer = this->begin(); citer!=this->end(); citer++)
00271     {
00272         bool first = true;
00273         const vector<mrs_real> *row = (*citer);
00274         int ii;
00275         for(ii=0; ii<(int)row->size()-1; ++ii)
00276         {
00277             if(!first)
00278                 mis->write(", ", 2);
00279             first = false;
00280 
00281             sprintf(buffer, "%09.4f", row->at(ii));
00282             mis->write(buffer, strlen(buffer));
00283         }
00284         mis->write(", ", 2);
00285         mrs_natural classIndex = (mrs_natural)row->at(ii);
00286         mis->write(classNames[classIndex].c_str(), strlen(classNames[classIndex].c_str()));
00287         mis->write("\n", 1);
00288     }
00289 
00290     mis->close();
00291     delete mis;
00292 }//Dump