NikosFirst.java
1 |
/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
16 |
|
17 |
/*
 *    NikosFirst.java (adapted from Resample.java)
 *    Copyright (C) 2002 University of Waikato, Hamilton, New Zealand
 *
 */
22 |
|
23 |
package weka.filters.supervised.instance; |
24 |
|
25 |
import weka.core.Capabilities; |
26 |
import weka.core.Instance; |
27 |
import weka.core.Instances; |
28 |
import weka.core.Option; |
29 |
import weka.core.OptionHandler; |
30 |
import weka.core.RevisionUtils; |
31 |
import weka.core.Utils; |
32 |
import weka.core.Capabilities.Capability; |
33 |
import weka.filters.Filter; |
34 |
import weka.filters.SupervisedFilter; |
35 |
|
36 |
import java.util.Collections; |
37 |
import java.util.Enumeration; |
38 |
import java.util.Random; |
39 |
import java.util.Vector; |
40 |
|
41 |
/**
|
42 |
<!-- globalinfo-start -->
|
43 |
* Produces a random subsample of a dataset using either sampling with replacement or without replacement.<br/>
|
44 |
* The original dataset must fit entirely in memory. The number of instances in the generated dataset may be specified. The dataset must have a nominal class attribute. If not, use the unsupervised version. The filter can be made to maintain the class distribution in the subsample, or to bias the class distribution toward a uniform distribution. When used in batch mode (i.e. in the FilteredClassifier), subsequent batches are NOT resampled.
|
45 |
* <p/>
|
46 |
<!-- globalinfo-end -->
|
47 |
*
|
48 |
<!-- options-start -->
|
49 |
* Valid options are: <p/>
|
50 |
*
|
51 |
* <pre> -S <num>
|
52 |
* Specify the random number seed (default 1)</pre>
|
53 |
*
|
54 |
* <pre> -Z <num>
|
55 |
* The size of the output dataset, as a percentage of
|
56 |
* the input dataset (default 100)</pre>
|
57 |
*
|
58 |
* <pre> -B <num>
|
59 |
* Bias factor towards uniform class distribution.
|
60 |
* 0 = distribution in input data -- 1 = uniform distribution.
|
61 |
* (default 0)</pre>
|
62 |
*
|
63 |
* <pre> -no-replacement
|
64 |
* Disables replacement of instances
|
65 |
* (default: with replacement)</pre>
|
66 |
*
|
67 |
* <pre> -V
|
68 |
* Inverts the selection - only available with '-no-replacement'.</pre>
|
69 |
*
|
70 |
<!-- options-end -->
|
71 |
*
|
72 |
* @author Len Trigg (len@reeltwo.com)
|
73 |
* @author FracPete (fracpete at waikato dot ac dot nz)
|
74 |
* @version $Revision: 5542 $
|
75 |
*/
|
76 |
public class NikosFirst |
77 |
extends Filter |
78 |
implements SupervisedFilter, OptionHandler {
|
79 |
|
80 |
/** for serialization. */
|
81 |
static final long serialVersionUID = 7079064953548300681L; |
82 |
|
83 |
/** The subsample size, percent of original set, default 100%. */
|
84 |
// Nikos protected double m_SampleSizePercent = 100;
|
85 |
|
86 |
/** The random number generator seed. */
|
87 |
protected int m_RandomSeed = 1; |
88 |
|
89 |
/** The degree of bias towards uniform (nominal) class distribution. */
|
90 |
// protected double m_BiasToUniformClass = 0;
|
91 |
|
92 |
/** Whether to perform sampling with replacement or without. */
|
93 |
//protected boolean m_NoReplacement = false;
|
94 |
|
95 |
/** Whether to invert the selection (only if instances are drawn WITHOUT
|
96 |
* replacement).
|
97 |
* @see #m_NoReplacement */
|
98 |
//protected boolean m_InvertSelection = false;
|
99 |
|
100 |
/**
|
101 |
* Returns a string describing this filter.
|
102 |
*
|
103 |
* @return This is the first attempt to write my own filter
|
104 |
* Later Goal is to construct ENN-rule algorithm as a weka filter.
|
105 |
*/
|
106 |
public String globalInfo() { |
107 |
return
|
108 |
"This are the words to display"
|
109 |
+ "and that is another row";
|
110 |
} |
111 |
|
112 |
/**
|
113 |
* Returns an enumeration describing the available options.
|
114 |
*
|
115 |
* @return an enumeration of all the available options.
|
116 |
*/
|
117 |
public Enumeration listOptions() { |
118 |
Vector result = new Vector(); |
119 |
|
120 |
result.addElement(new Option( |
121 |
"\tSpecify the random number seed (default 1)",
|
122 |
"S", 1, "-S <num>")); |
123 |
|
124 |
result.addElement(new Option( |
125 |
"\tThe size of the output dataset, as a percentage of\n"
|
126 |
+"\tthe input dataset (default 100)",
|
127 |
"Z", 1, "-Z <num>")); |
128 |
|
129 |
result.addElement(new Option( |
130 |
"\tBias factor towards uniform class distribution.\n"
|
131 |
+"\t0 = distribution in input data -- 1 = uniform distribution.\n"
|
132 |
+"\t(default 0)",
|
133 |
"B", 1, "-B <num>")); |
134 |
|
135 |
result.addElement(new Option( |
136 |
"\tDisables replacement of instances\n"
|
137 |
+"\t(default: with replacement)",
|
138 |
"no-replacement", 0, "-no-replacement")); |
139 |
|
140 |
result.addElement(new Option( |
141 |
"\tInverts the selection - only available with '-no-replacement'.",
|
142 |
"V", 0, "-V")); |
143 |
|
144 |
return result.elements();
|
145 |
} |
146 |
|
147 |
|
148 |
/**
|
149 |
* Parses a given list of options. <p/>
|
150 |
*
|
151 |
<!-- options-start -->
|
152 |
* Valid options are: <p/>
|
153 |
*
|
154 |
* <pre> -S <num>
|
155 |
* Specify the random number seed (default 1)</pre>
|
156 |
*
|
157 |
* <pre> -Z <num>
|
158 |
* The size of the output dataset, as a percentage of
|
159 |
* the input dataset (default 100)</pre>
|
160 |
*
|
161 |
* <pre> -B <num>
|
162 |
* Bias factor towards uniform class distribution.
|
163 |
* 0 = distribution in input data -- 1 = uniform distribution.
|
164 |
* (default 0)</pre>
|
165 |
*
|
166 |
* <pre> -no-replacement
|
167 |
* Disables replacement of instances
|
168 |
* (default: with replacement)</pre>
|
169 |
*
|
170 |
* <pre> -V
|
171 |
* Inverts the selection - only available with '-no-replacement'.</pre>
|
172 |
*
|
173 |
<!-- options-end -->
|
174 |
*
|
175 |
* @param options the list of options as an array of strings
|
176 |
* @throws Exception if an option is not supported
|
177 |
*/
|
178 |
public void setOptions(String[] options) throws Exception { |
179 |
String tmpStr;
|
180 |
|
181 |
tmpStr = Utils.getOption('S', options);
|
182 |
if (tmpStr.length() != 0) |
183 |
setRandomSeed(Integer.parseInt(tmpStr));
|
184 |
else
|
185 |
setRandomSeed(1);
|
186 |
|
187 |
if (getInputFormat() != null) { |
188 |
setInputFormat(getInputFormat()); |
189 |
} |
190 |
} |
191 |
|
192 |
|
193 |
|
194 |
/**
|
195 |
* Gets the current settings of the filter.
|
196 |
*
|
197 |
* @return an array of strings suitable for passing to setOptions
|
198 |
*/
|
199 |
|
200 |
|
201 |
|
202 |
|
203 |
|
204 |
public String [] getOptions() { |
205 |
Vector<String> result; |
206 |
|
207 |
result = new Vector<String>(); |
208 |
|
209 |
|
210 |
result.add("-S");
|
211 |
result.add("" + getRandomSeed());
|
212 |
|
213 |
|
214 |
return result.toArray(new String[result.size()]); |
215 |
} |
216 |
|
217 |
|
218 |
|
219 |
|
220 |
|
221 |
|
222 |
|
223 |
|
224 |
/**
|
225 |
* Returns the tip text for this property.
|
226 |
*
|
227 |
* @return tip text for this property suitable for
|
228 |
* displaying in the explorer/experimenter gui
|
229 |
*/
|
230 |
public String randomSeedTipText() { |
231 |
return "Sets the random number seed for subsampling."; |
232 |
} |
233 |
|
234 |
/**
|
235 |
* Gets the random number seed.
|
236 |
*
|
237 |
* @return the random number seed.
|
238 |
*/
|
239 |
public int getRandomSeed() { |
240 |
return m_RandomSeed;
|
241 |
} |
242 |
|
243 |
/**
|
244 |
* Sets the random number seed.
|
245 |
*
|
246 |
* @param newSeed the new random number seed.
|
247 |
*/
|
248 |
public void setRandomSeed(int newSeed) { |
249 |
m_RandomSeed = newSeed; |
250 |
} |
251 |
|
252 |
|
253 |
|
254 |
|
255 |
|
256 |
|
257 |
|
258 |
|
259 |
|
260 |
/**
|
261 |
* Returns the Capabilities of this filter.
|
262 |
*
|
263 |
* @return the capabilities of this object
|
264 |
* @see Capabilities
|
265 |
*/
|
266 |
public Capabilities getCapabilities() {
|
267 |
Capabilities result = super.getCapabilities();
|
268 |
result.disableAll(); |
269 |
|
270 |
// attributes
|
271 |
result.enableAllAttributes(); |
272 |
result.enable(Capability.MISSING_VALUES); |
273 |
|
274 |
// class
|
275 |
result.enable(Capability.NOMINAL_CLASS); |
276 |
|
277 |
return result;
|
278 |
} |
279 |
|
280 |
|
281 |
|
282 |
|
283 |
|
284 |
|
285 |
|
286 |
|
287 |
|
288 |
/**
|
289 |
* Sets the format of the input instances.
|
290 |
*
|
291 |
* @param instanceInfo an Instances object containing the input
|
292 |
* instance structure (any instances contained in the object are
|
293 |
* ignored - only the structure is required).
|
294 |
* @return true if the outputFormat may be collected immediately
|
295 |
* @throws Exception if the input format can't be set
|
296 |
* successfully
|
297 |
*/
|
298 |
public boolean setInputFormat(Instances instanceInfo) |
299 |
throws Exception { |
300 |
|
301 |
super.setInputFormat(instanceInfo);
|
302 |
setOutputFormat(instanceInfo); |
303 |
return true; |
304 |
} |
305 |
|
306 |
|
307 |
|
308 |
|
309 |
|
310 |
|
311 |
|
312 |
|
313 |
|
314 |
/**
|
315 |
* Input an instance for filtering. Filter requires all
|
316 |
* training instances be read before producing output.
|
317 |
*
|
318 |
* @param instance the input instance
|
319 |
* @return true if the filtered instance may now be
|
320 |
* collected with output().
|
321 |
* @throws IllegalStateException if no input structure has been defined
|
322 |
*/
|
323 |
public boolean input(Instance instance) { |
324 |
|
325 |
if (getInputFormat() == null) { |
326 |
throw new IllegalStateException("No input instance format defined"); |
327 |
} |
328 |
if (m_NewBatch) {
|
329 |
resetQueue(); |
330 |
m_NewBatch = false;
|
331 |
} |
332 |
if (isFirstBatchDone()) {
|
333 |
push(instance); |
334 |
return true; |
335 |
} else {
|
336 |
bufferInput(instance); |
337 |
return false; |
338 |
} |
339 |
} |
340 |
|
341 |
|
342 |
|
343 |
|
344 |
|
345 |
|
346 |
|
347 |
|
348 |
|
349 |
|
350 |
|
351 |
|
352 |
|
353 |
|
354 |
|
355 |
/**
|
356 |
* creates the subsample without replacement.
|
357 |
*
|
358 |
* @param random the random number generator to use
|
359 |
* @param origSize the original size of the dataset
|
360 |
* @param sampleSize the size to generate
|
361 |
* @param actualClasses the number of classes found in the data
|
362 |
* @param classIndices the indices where classes start
|
363 |
*/
|
364 |
|
365 |
|
366 |
|
367 |
|
368 |
|
369 |
|
370 |
|
371 |
// Create the new sample
|
372 |
Random random = new Random(m_RandomSeed); |
373 |
|
374 |
|
375 |
|
376 |
|
377 |
public boolean batchFinished() { |
378 |
|
379 |
if (getInputFormat() == null) { |
380 |
throw new IllegalStateException("No input instance format defined"); |
381 |
} |
382 |
|
383 |
if (!isFirstBatchDone()) {
|
384 |
// Do the subsample, and clear the input instances.
|
385 |
createSubsample(); |
386 |
System.out.println("Seed is: "+ m_RandomSeed); |
387 |
} |
388 |
flushInput(); |
389 |
|
390 |
m_NewBatch = true;
|
391 |
m_FirstBatchDone = true;
|
392 |
return (numPendingOutput() != 0); |
393 |
} |
394 |
|
395 |
|
396 |
|
397 |
protected void createSubsample() { |
398 |
int origSize = getInputFormat().numInstances();
|
399 |
|
400 |
|
401 |
|
402 |
|
403 |
// Sort according to class attribute.
|
404 |
getInputFormat().sort(getInputFormat().classIndex()); |
405 |
|
406 |
// Create an index of where each class value starts
|
407 |
int[] classIndices = new int [getInputFormat().numClasses() + 1]; |
408 |
int currentClass = 0; |
409 |
classIndices[currentClass] = 0;
|
410 |
for (int i = 0; i < getInputFormat().numInstances(); i++) { |
411 |
Instance current = getInputFormat().instance(i); |
412 |
if (current.classIsMissing()) {
|
413 |
for (int j = currentClass + 1; j < classIndices.length; j++) { |
414 |
classIndices[j] = i; |
415 |
} |
416 |
break;
|
417 |
} else if (current.classValue() != currentClass) { |
418 |
for (int j = currentClass + 1; j <= current.classValue(); j++) { |
419 |
classIndices[j] = i; |
420 |
} |
421 |
currentClass = (int) current.classValue();
|
422 |
} |
423 |
} |
424 |
if (currentClass <= getInputFormat().numClasses()) {
|
425 |
for (int j = currentClass + 1; j < classIndices.length; j++) { |
426 |
classIndices[j] = getInputFormat().numInstances(); |
427 |
} |
428 |
} |
429 |
|
430 |
int actualClasses = 0; |
431 |
for (int i = 0; i < classIndices.length - 1; i++) { |
432 |
if (classIndices[i] != classIndices[i + 1]) { |
433 |
actualClasses++; |
434 |
} |
435 |
}} |
436 |
|
437 |
|
438 |
|
439 |
|
440 |
|
441 |
|
442 |
|
443 |
|
444 |
|
445 |
|
446 |
|
447 |
|
448 |
|
449 |
|
450 |
|
451 |
|
452 |
|
453 |
|
454 |
|
455 |
|
456 |
|
457 |
|
458 |
|
459 |
|
460 |
|
461 |
|
462 |
|
463 |
|
464 |
/**
|
465 |
* Returns the revision string.
|
466 |
*
|
467 |
* @return the revision
|
468 |
*/
|
469 |
public String getRevision() { |
470 |
return RevisionUtils.extract("$Revision: 5542 $"); |
471 |
} |
472 |
|
473 |
/**
|
474 |
* Main method for testing this class.
|
475 |
*
|
476 |
* @param argv should contain arguments to the filter:
|
477 |
* use -h for help
|
478 |
*/
|
479 |
public static void main(String [] argv) { |
480 |
runFilter(new Resample(), argv);
|
481 |
} |
482 |
} |