/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    NikosFirst.java (derived from Resample.java)
 *    Copyright (C) 2002 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.filters.supervised.instance;

import weka.core.Capabilities;
import weka.core.Capabilities.Capability;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.RevisionUtils;
import weka.core.Utils;
import weka.filters.Filter;
import weka.filters.SupervisedFilter;

import java.util.Collections;
import java.util.Enumeration;
import java.util.Random;
import java.util.Vector;

/** 
42
 <!-- globalinfo-start -->
43
 * Produces a random subsample of a dataset using either sampling with replacement or without replacement.<br/>
44
 * The original dataset must fit entirely in memory. The number of instances in the generated dataset may be specified. The dataset must have a nominal class attribute. If not, use the unsupervised version. The filter can be made to maintain the class distribution in the subsample, or to bias the class distribution toward a uniform distribution. When used in batch mode (i.e. in the FilteredClassifier), subsequent batches are NOT resampled.
45
 * <p/>
46
 <!-- globalinfo-end -->
47
 * 
48
 <!-- options-start -->
49
 * Valid options are: <p/>
50
 * 
51
 * <pre> -S &lt;num&gt;
52
 *  Specify the random number seed (default 1)</pre>
53
 * 
54
 * <pre> -Z &lt;num&gt;
55
 *  The size of the output dataset, as a percentage of
56
 *  the input dataset (default 100)</pre>
57
 * 
58
 * <pre> -B &lt;num&gt;
59
 *  Bias factor towards uniform class distribution.
60
 *  0 = distribution in input data -- 1 = uniform distribution.
61
 *  (default 0)</pre>
62
 * 
63
 * <pre> -no-replacement
64
 *  Disables replacement of instances
65
 *  (default: with replacement)</pre>
66
 * 
67
 * <pre> -V
68
 *  Inverts the selection - only available with '-no-replacement'.</pre>
69
 * 
70
 <!-- options-end -->
71
 *
72
 * @author Len Trigg (len@reeltwo.com)
73
 * @author FracPete (fracpete at waikato dot ac dot nz)
74
 * @version $Revision: 5542 $ 
75
 */
76
public class NikosFirst
77
  extends Filter 
78
  implements SupervisedFilter, OptionHandler {
79
  
80
  /** for serialization. */
81
  static final long serialVersionUID = 7079064953548300681L;
82

    
83
  /** The subsample size, percent of original set, default 100%. */
84
  // Nikos             protected double m_SampleSizePercent = 100;
85
  
86
  /** The random number generator seed. */
87
  protected int m_RandomSeed = 1;
88
  
89
  /** The degree of bias towards uniform (nominal) class distribution. */
90
 // protected double m_BiasToUniformClass = 0;
91

    
92
  /** Whether to perform sampling with replacement or without. */
93
  //protected boolean m_NoReplacement = false;
94

    
95
  /** Whether to invert the selection (only if instances are drawn WITHOUT 
96
   * replacement).
97
   * @see #m_NoReplacement */
98
  //protected boolean m_InvertSelection = false;
99

    
100
  /**
101
   * Returns a string describing this filter.
102
   *
103
   * @return This is the first attempt to write my own filter
104
   * Later Goal is to construct ENN-rule algorithm as a weka filter.
105
   */
106
  public String globalInfo() {
107
    return 
108
        "This are the words to display"
109
        + "and that is another row";
110
  }
111

    
112
  /**
113
   * Returns an enumeration describing the available options.
114
   *
115
   * @return an enumeration of all the available options.
116
   */
117
  public Enumeration listOptions() {
118
    Vector result = new Vector();
119

    
120
    result.addElement(new Option(
121
        "\tSpecify the random number seed (default 1)",
122
        "S", 1, "-S <num>"));
123

    
124
    result.addElement(new Option(
125
        "\tThe size of the output dataset, as a percentage of\n"
126
        +"\tthe input dataset (default 100)",
127
        "Z", 1, "-Z <num>"));
128

    
129
    result.addElement(new Option(
130
        "\tBias factor towards uniform class distribution.\n"
131
        +"\t0 = distribution in input data -- 1 = uniform distribution.\n"
132
        +"\t(default 0)",
133
        "B", 1, "-B <num>"));
134

    
135
    result.addElement(new Option(
136
        "\tDisables replacement of instances\n"
137
        +"\t(default: with replacement)",
138
        "no-replacement", 0, "-no-replacement"));
139

    
140
    result.addElement(new Option(
141
        "\tInverts the selection - only available with '-no-replacement'.",
142
        "V", 0, "-V"));
143

    
144
    return result.elements();
145
  }
146

    
147

    
148
  /**
149
   * Parses a given list of options. <p/>
150
   * 
151
   <!-- options-start -->
152
   * Valid options are: <p/>
153
   * 
154
   * <pre> -S &lt;num&gt;
155
   *  Specify the random number seed (default 1)</pre>
156
   * 
157
   * <pre> -Z &lt;num&gt;
158
   *  The size of the output dataset, as a percentage of
159
   *  the input dataset (default 100)</pre>
160
   * 
161
   * <pre> -B &lt;num&gt;
162
   *  Bias factor towards uniform class distribution.
163
   *  0 = distribution in input data -- 1 = uniform distribution.
164
   *  (default 0)</pre>
165
   * 
166
   * <pre> -no-replacement
167
   *  Disables replacement of instances
168
   *  (default: with replacement)</pre>
169
   * 
170
   * <pre> -V
171
   *  Inverts the selection - only available with '-no-replacement'.</pre>
172
   * 
173
   <!-- options-end -->
174
   *
175
   * @param options the list of options as an array of strings
176
   * @throws Exception if an option is not supported
177
   */
178
  public void setOptions(String[] options) throws Exception {
179
    String        tmpStr;
180
    
181
    tmpStr = Utils.getOption('S', options);
182
    if (tmpStr.length() != 0)
183
      setRandomSeed(Integer.parseInt(tmpStr));
184
    else
185
      setRandomSeed(1);
186

    
187
    if (getInputFormat() != null) {
188
      setInputFormat(getInputFormat());
189
    }
190
  }
191

    
192
  
193
  
194
  /**
195
   * Gets the current settings of the filter.
196
   *
197
   * @return an array of strings suitable for passing to setOptions
198
   */
199
  
200
  
201
  
202
  
203
  
204
  public String [] getOptions() {
205
    Vector<String>        result;
206

    
207
    result = new Vector<String>();
208

    
209

    
210
    result.add("-S");
211
    result.add("" + getRandomSeed());
212

    
213
    
214
    return result.toArray(new String[result.size()]);
215
  }
216
    
217
  
218
  
219
  
220
  
221
  
222
 
223
  
224
  /**
225
   * Returns the tip text for this property.
226
   *
227
   * @return tip text for this property suitable for
228
   * displaying in the explorer/experimenter gui
229
   */
230
  public String randomSeedTipText() {
231
    return "Sets the random number seed for subsampling.";
232
  }
233
  
234
  /**
235
   * Gets the random number seed.
236
   *
237
   * @return the random number seed.
238
   */
239
  public int getRandomSeed() {
240
    return m_RandomSeed;
241
  }
242
  
243
  /**
244
   * Sets the random number seed.
245
   *
246
   * @param newSeed the new random number seed.
247
   */
248
  public void setRandomSeed(int newSeed) {
249
    m_RandomSeed = newSeed;
250
  }
251
    
252
  
253
  
254
  
255
  
256
  
257
  
258
 
259

    
260
  /** 
261
   * Returns the Capabilities of this filter.
262
   *
263
   * @return            the capabilities of this object
264
   * @see               Capabilities
265
   */
266
  public Capabilities getCapabilities() {
267
    Capabilities result = super.getCapabilities();
268
    result.disableAll();
269

    
270
    // attributes
271
    result.enableAllAttributes();
272
    result.enable(Capability.MISSING_VALUES);
273
    
274
    // class
275
    result.enable(Capability.NOMINAL_CLASS);
276
    
277
    return result;
278
  }
279
  
280
  
281
  
282
  
283
  
284
  
285
  
286
  
287
  
288
  /**
289
   * Sets the format of the input instances.
290
   *
291
   * @param instanceInfo an Instances object containing the input 
292
   * instance structure (any instances contained in the object are 
293
   * ignored - only the structure is required).
294
   * @return true if the outputFormat may be collected immediately
295
   * @throws Exception if the input format can't be set 
296
   * successfully
297
   */
298
  public boolean setInputFormat(Instances instanceInfo) 
299
       throws Exception {
300

    
301
    super.setInputFormat(instanceInfo);
302
    setOutputFormat(instanceInfo);
303
    return true;
304
  }
305

    
306
  
307
  
308
  
309
  
310
  
311
  
312
  
313
  
314
  /**
315
   * Input an instance for filtering. Filter requires all
316
   * training instances be read before producing output.
317
   *
318
   * @param instance the input instance
319
   * @return true if the filtered instance may now be
320
   * collected with output().
321
   * @throws IllegalStateException if no input structure has been defined
322
   */
323
  public boolean input(Instance instance) {
324

    
325
    if (getInputFormat() == null) {
326
      throw new IllegalStateException("No input instance format defined");
327
    }
328
    if (m_NewBatch) {
329
      resetQueue();
330
      m_NewBatch = false;
331
    }
332
    if (isFirstBatchDone()) {
333
      push(instance);
334
      return true;
335
    } else {
336
      bufferInput(instance);
337
      return false;
338
    }
339
  }
340
  
341
  
342
  
343
  
344
  
345
  
346
  
347
  
348
  
349
  
350
  
351
  
352

    
353

    
354

    
355
  /**
356
   * creates the subsample without replacement.
357
   * 
358
   * @param random                the random number generator to use
359
   * @param origSize                the original size of the dataset
360
   * @param sampleSize                the size to generate
361
   * @param actualClasses        the number of classes found in the data
362
   * @param classIndices        the indices where classes start
363
   */
364
  
365
    
366

    
367

    
368

    
369

    
370

    
371
    // Create the new sample
372
    Random random = new Random(m_RandomSeed);
373

    
374
    
375
    
376
    
377
    public boolean batchFinished() {
378

    
379
        if (getInputFormat() == null) {
380
          throw new IllegalStateException("No input instance format defined");
381
        }
382

    
383
        if (!isFirstBatchDone()) {
384
          // Do the subsample, and clear the input instances.
385
          createSubsample();
386
          System.out.println("Seed is: "+ m_RandomSeed);
387
        }
388
        flushInput();
389

    
390
        m_NewBatch = true;
391
        m_FirstBatchDone = true;
392
        return (numPendingOutput() != 0);
393
      }
394
    
395
    
396
    
397
    protected void createSubsample() {
398
        int origSize = getInputFormat().numInstances();
399

    
400

    
401

    
402

    
403
        // Sort according to class attribute.
404
        getInputFormat().sort(getInputFormat().classIndex());
405
        
406
        // Create an index of where each class value starts
407
        int[] classIndices = new int [getInputFormat().numClasses() + 1];
408
        int currentClass = 0;
409
        classIndices[currentClass] = 0;
410
        for (int i = 0; i < getInputFormat().numInstances(); i++) {
411
          Instance current = getInputFormat().instance(i);
412
          if (current.classIsMissing()) {
413
            for (int j = currentClass + 1; j < classIndices.length; j++) {
414
              classIndices[j] = i;
415
            }
416
            break;
417
          } else if (current.classValue() != currentClass) {
418
            for (int j = currentClass + 1; j <= current.classValue(); j++) {
419
              classIndices[j] = i;
420
            }          
421
            currentClass = (int) current.classValue();
422
          }
423
        }
424
        if (currentClass <= getInputFormat().numClasses()) {
425
          for (int j = currentClass + 1; j < classIndices.length; j++) {
426
            classIndices[j] = getInputFormat().numInstances();
427
          }
428
        }
429
        
430
        int actualClasses = 0;
431
        for (int i = 0; i < classIndices.length - 1; i++) {
432
          if (classIndices[i] != classIndices[i + 1]) {
433
            actualClasses++;
434
          }
435
        }}
436

    
437

    
438
        
439
    
440
    
441
    
442
    
443
    
444
    
445
    
446
    
447
    
448
    
449
    
450
    
451
    
452
    
453
    
454
    
455
    
456
    
457
    
458
    
459
    
460
    
461
    
462
    
463
  
464
  /**
465
   * Returns the revision string.
466
   * 
467
   * @return                the revision
468
   */
469
  public String getRevision() {
470
    return RevisionUtils.extract("$Revision: 5542 $");
471
  }
472
  
473
  /**
474
   * Main method for testing this class.
475
   *
476
   * @param argv should contain arguments to the filter: 
477
   * use -h for help
478
   */
479
  public static void main(String [] argv) {
480
    runFilter(new Resample(), argv);
481
  }
482
}