/*************************************************************************************
DBCountPreprocessor.java
Preprocesses data ready to calculate a count diamond. No measures.
Copyright (C) 2008 Hazel Webb hazel.webb@unb.ca
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
**/
/**
* DBCountPreprocessor.java
* input: fname: csv fact table
* d: number of dimensions
* [ n: normalized binary output]
* and/or [ s: hash table(Key, Count) character output]
*
* output: 1) array file first line indicates number of dims
* second line cardinalities for each dim
* remaining lines counts for each attr
* 2) binary file data stored as packed ints (no eol to indicate new fact)
* normalizes data starting from 0
*
* 3) hash table files suitable for reading by DBCount.java as an option
*
* It is expected that the output file will be used by DBCount_binary or DBCount to process for K.
*
* May 6, 2008
* changes made to standardize interface and provide optional output January 2009
* Hazel Webb
******************************************************************************************/
import java.io.*;
import java.nio.file.Files;
import java.nio.file.StandardCopyOption;
import java.util.*;
/**
 * Command-line preprocessor for a CSV fact table.
 *
 * <p>Depending on the flags given it produces:
 * <ul>
 *   <li>{@code -s}: per-dimension (attribute value, count) tables written as text to
 *       {@code <file>_COUNT} and {@code <file>_COUNT.full};</li>
 *   <li>{@code -n}: a binary file {@code <file>_array} (plus a copy {@code <file>_array_bak})
 *       holding the number of dimensions, the cardinality of each dimension and the count of
 *       every normalized attribute; a binary file {@code <file>.norm} of normalized facts
 *       written as ints with no separators; and a text version of the normalized facts.</li>
 * </ul>
 * A {@code <file>-string-to-norm} file is always created; it lists the attribute-to-integer
 * mapping when {@code -n} is requested.
 */
public class DBCountPreprocessor {

    /** Usage text shared by the argument-count check, the -h flag and every argument error. */
    private static final String USAGE =
        "Usage: DBCountPreprocessor -f filename -d dims -n AND/OR -s [-v verbose]";

    /** Number of data dimensions (leading CSV columns that are processed). */
    private int d;

    /** Noisy output. */
    private static boolean v = false;

    /** Input file name. */
    private String file;

    /** Per dimension: counts indexed by normalized attribute value. */
    private final List<List<Integer>> data = new ArrayList<>();

    /**
     * Per dimension: attribute value -> count. Kept as Hashtable so the order in which
     * entries are written to the _COUNT files does not change.
     */
    private final List<Hashtable<String, Integer>> mappedCoords = new ArrayList<>();

    /** Per dimension: attribute value -> normalized integer (assigned from 0 in order seen). */
    private final List<Hashtable<String, Integer>> mappedNorms = new ArrayList<>();

    /** Normalized binary output requested (-n). */
    private static boolean norm = false;

    /** Hash table character output requested (-s). */
    private static boolean hash = false;

    /**
     * Parses the command line, runs the preprocessing steps that were requested and prints
     * the elapsed time.
     *
     * @param args {@code -f|-filename <file>}, {@code -d <dims>}, {@code -n}, {@code -s},
     *             {@code -h}, {@code -v}
     */
    public static void main(String[] args) {
        if (args.length < 4) {
            System.out.println(USAGE);
            System.exit(1);
        }
        DBCountPreprocessor proc = new DBCountPreprocessor();
        for (int a = 0; a < args.length; ++a) {
            String flag = args[a];
            switch (flag) {
                case "-filename":
                case "-f":
                    proc.file = requireValue(args, ++a, flag);
                    break;
                case "-d":
                    proc.d = parseDims(requireValue(args, ++a, flag));
                    break;
                case "-n":
                    norm = true;
                    break;
                case "-s":
                    hash = true;
                    break;
                case "-h":
                    System.out.println(USAGE);
                    System.exit(0);
                    break;
                case "-v":
                    v = true;
                    break;
                default:
                    usageError("Unknown option: " + flag);
            }
        }
        if (proc.file == null) {
            usageError("No input file given (-f filename)");
        }
        if (proc.d < 1) {
            usageError("Number of dimensions not given (-d dims)");
        }

        // timing starts
        long startTime = System.currentTimeMillis();
        // createHashTables fills the count and/or normalization tables depending on
        // the truth values of hash and norm
        proc.createHashTables(proc.file, proc.d);
        if (hash) {
            // only write the hash table files if requested
            proc.writeHashTables(proc.file);
        }
        if (norm) {
            // only write the normalized files and arrays if requested
            proc.writeArrays(proc.file);
            proc.writeNormFile(proc.file, proc.file + ".norm");
        }
        long endTime = System.currentTimeMillis();
        double time = (endTime - startTime) / 1000.0;
        System.out.println("Time for preprocessing: " + time + " seconds");
        // proc.checkFile(proc.file);
        // sanity check only use on small files - it prints everything
    }

    /** Prints the problem followed by the usage text and terminates with exit status 1. */
    private static void usageError(String problem) {
        System.out.println(problem);
        System.out.println(USAGE);
        System.exit(1);
    }

    /** Returns args[index], or reports a usage error when the flag is the last argument. */
    private static String requireValue(String[] args, int index, String flag) {
        if (index >= args.length) {
            usageError("Missing value for " + flag);
        }
        return args[index];
    }

    /** Parses the -d value; reports a usage error unless it is a positive integer. */
    private static int parseDims(String text) {
        int dims = 0;
        try {
            dims = Integer.parseInt(text);
        } catch (NumberFormatException e) {
            usageError("Number of dimensions must be an integer: " + text);
        }
        if (dims < 1) {
            usageError("Number of dimensions must be at least 1: " + text);
        }
        return dims;
    }

    /**
     * Splits one CSV line into its fields. The -1 limit keeps trailing empty fields, which a
     * plain split(",") silently drops.
     *
     * @throws IllegalArgumentException if the line has fewer than dims fields
     */
    private static String[] splitFields(String line, int dims, int lineNo) {
        String[] fields = line.split(",", -1);
        if (fields.length < dims) {
            throw new IllegalArgumentException("Line " + lineNo + " has " + fields.length
                + " field(s) but " + dims + " dimension(s) were requested: " + line);
        }
        return fields;
    }

    /**
     * Removes the extension from the last component of a path. A path whose last component
     * has no '.' is returned unchanged, and dots in directory names are ignored.
     */
    private static String stripExtension(String path) {
        int dot = path.lastIndexOf('.');
        int separator = Math.max(path.lastIndexOf('/'), path.lastIndexOf(File.separatorChar));
        return dot > separator ? path.substring(0, dot) : path;
    }

    /**
     * writeArrays
     * Writes the per-dimension counts to the binary file 'fname_array' and then copies that
     * file to 'fname_array_bak'.
     * Format (all values written with DataOutputStream.writeInt):
     *   numberOfDimensions dim0Size dim1Size ... dimnSize
     *   attr counts for dim0; attr counts for dim1 .... attr counts for dimn
     * I/O failures are reported on standard out.
     *
     * @param fname base file name; "_array" is appended
     */
    public void writeArrays(String fname) {
        fname = fname + "_array";
        File arrayFile = new File(fname);
        try {
            try (DataOutputStream dataOut = new DataOutputStream(
                    new BufferedOutputStream(new FileOutputStream(arrayFile)))) {
                dataOut.writeInt(d);                         // number of dims
                for (Hashtable<String, Integer> norms : mappedNorms) {
                    dataOut.writeInt(norms.size());          // dim sizes
                }
                for (List<Integer> counts : data) {
                    for (int count : counts) {
                        dataOut.writeInt(count);
                    }
                }
            }
            // The array file is closed (and therefore complete) before the back-up copy
            // is taken. Copying in-process avoids depending on an external "cp" command.
            File backup = new File(fname + "_bak");
            System.out.println("backing up: " + fname + " -> " + backup.getPath());
            Files.copy(arrayFile.toPath(), backup.toPath(), StandardCopyOption.REPLACE_EXISTING);
        } catch (IOException e) {
            System.out.println("IO exception = " + e);
        }
    }

    /**
     * Reads 'fname_array' and prints every int in it to standard out, to verify execution.
     * Only available when normalized output (-n) was requested.
     * WARNING: this method prints everything. Use with care!
     *
     * @param fname base file name; "_array" is appended
     */
    public void checkFile(String fname) {
        if (!norm) {
            System.out.println("Array option not chosen so validation not possible");
            return;
        }
        try (DataInputStream dataIn = new DataInputStream(
                new BufferedInputStream(new FileInputStream(fname + "_array")))) {
            while (true) {
                int value;
                try {
                    value = dataIn.readInt();
                } catch (EOFException eof) {
                    System.out.println("End of File");
                    break;
                }
                // Print out the integers.
                System.out.printf("%3d \t", value);
            }
        } catch (IOException e) {
            System.out.println("IO Exception =: " + e);
        }
    }

    /**
     * createHashTables
     * Reads the CSV fact table and, for each of the first d columns:
     *   - when hash output is requested, counts the occurrences of every attribute value;
     *   - when normalized output is requested, assigns each new attribute value the next
     *     integer (starting from 0), records the assignment in 'fname-string-to-norm' and
     *     counts occurrences per normalized value.
     * All tables are rebuilt from scratch on every call. Empty lines are skipped.
     * I/O failures are reported with a stack trace.
     *
     * @param fname the csv fact table to process; it should have no headers or metadata
     * @param d     number of dimensions to read from each line
     * @throws IllegalArgumentException if a non-empty line has fewer than d fields
     */
    public void createHashTables(String fname, int d) {
        this.d = d;
        data.clear();
        mappedCoords.clear();
        mappedNorms.clear();
        for (int i = 0; i < d; i++) {
            data.add(new ArrayList<Integer>());
            mappedCoords.add(new Hashtable<String, Integer>());
            mappedNorms.add(new Hashtable<String, Integer>());
        }
        // normalized value to hand out to the next unseen key in each dimension
        int[] nextNorm = new int[d];

        // The reader is opened first so a missing input file does not leave an empty
        // mapping file behind.
        try (BufferedReader raw = new BufferedReader(
                 new InputStreamReader(new FileInputStream(fname)));
             PrintWriter permanentOut = new PrintWriter(
                 new BufferedWriter(new FileWriter(fname + "-string-to-norm")))) {
            String line;
            int lineNo = 0;
            while ((line = raw.readLine()) != null) {
                ++lineNo;
                if (v) {
                    System.out.println("Line read: " + line);
                }
                if (line.isEmpty()) {
                    continue;
                }
                String[] atts = splitFields(line, d, lineNo);
                for (int k = 0; k < d; ++k) {
                    String key = atts[k];
                    if (hash) {
                        Hashtable<String, Integer> keyCounts = mappedCoords.get(k);
                        Integer seen = keyCounts.get(key);
                        if (seen == null) {
                            if (v) System.out.println("New Key: " + key);
                            keyCounts.put(key, 1);
                        } else {
                            keyCounts.put(key, seen + 1);
                        }
                    }
                    if (norm) {
                        Hashtable<String, Integer> norms = mappedNorms.get(k);
                        List<Integer> normCounts = data.get(k);
                        Integer slot = norms.get(key);
                        if (slot == null) {
                            if (v) System.out.println("New Key: " + key);
                            norms.put(key, nextNorm[k]);
                            // store the relationship permanently
                            permanentOut.println("dim: " + k + "\t" + key + "\t" + nextNorm[k]);
                            normCounts.add(1);   // first occurrence of this key
                            nextNorm[k]++;
                        } else {
                            int index = slot;
                            normCounts.set(index, normCounts.get(index) + 1);
                        }
                    }
                }
            }
        } catch (IOException ie) {
            ie.printStackTrace();
        }
        if (v && hash) {
            for (int i = 0; i < d; ++i) {
                System.out.println(mappedCoords.get(i));
            }
        }
    }

    /**
     * writeHashTables
     * Writes the (attribute value, count) tables to 'fname_COUNT' and 'fname_COUNT.full'
     * (both files receive the same content):
     *   line 1: number of dimensions
     *   next d lines: "index size" for each dimension
     *   remaining lines: "key<TAB>count", dimension by dimension
     * Entries are written from the table's entry set so each key is always paired with its
     * own count; the order of entries within a dimension is whatever the table yields.
     *
     * @param fname base file name
     */
    public void writeHashTables(String fname) {
        try (PrintWriter out = new PrintWriter(
                 new BufferedWriter(new FileWriter(fname + "_COUNT")));
             PrintWriter fullOut = new PrintWriter(
                 new BufferedWriter(new FileWriter(fname + "_COUNT.full")))) {
            // write out metadata: number of dimensions
            out.print(d + "\n");
            fullOut.print(d + "\n");
            // dimension sizes
            for (int index = 0; index < d; ++index) {
                String sizeLine = index + " " + mappedCoords.get(index).size() + "\n";
                out.print(sizeLine);
                fullOut.print(sizeLine);
            }
            // attributes and their counts
            for (int index = 0; index < d; ++index) {
                for (Map.Entry<String, Integer> entry : mappedCoords.get(index).entrySet()) {
                    String entryLine = entry.getKey() + "\t" + entry.getValue();
                    out.println(entryLine);
                    fullOut.println(entryLine);
                }
            }
        } catch (IOException ie) {
            System.out.println("inside writeHashTables");
            ie.printStackTrace();
        }
    }

    /**
     * writeNormFile
     * Re-reads the fact table and writes each fact with its attribute values replaced by
     * their normalized integers:
     *   - writeFile receives the integers in binary (writeInt), with no commas or linebreaks;
     *   - '<readFile without extension>_normalized' receives them as text, one fact per
     *     line, every value followed by a comma.
     * Empty lines are skipped. I/O failures are reported with a stack trace.
     *
     * @param readFile  name of the csv file to read
     * @param writeFile name of the binary file to write
     * @throws IllegalArgumentException if a non-empty line has fewer than d fields
     * @throws IllegalStateException    if a value has no normalized mapping
     */
    public void writeNormFile(String readFile, String writeFile) {
        System.out.println("writing normalized file.....");
        String normStringFileName = stripExtension(readFile) + "_normalized";
        try (BufferedReader buf = new BufferedReader(
                 new InputStreamReader(new FileInputStream(readFile)));
             DataOutputStream pw = new DataOutputStream(
                 new BufferedOutputStream(new FileOutputStream(new File(writeFile))));
             PrintWriter writer = new PrintWriter(
                 new BufferedWriter(new FileWriter(normStringFileName)))) {
            String line;
            int lineNo = 0;
            while ((line = buf.readLine()) != null) {
                ++lineNo;
                if (line.isEmpty()) {
                    continue;
                }
                String[] fields = splitFields(line, d, lineNo);
                for (int i = 0; i < d; ++i) {
                    Integer normalized = mappedNorms.get(i).get(fields[i]);
                    if (normalized == null) {
                        throw new IllegalStateException("Line " + lineNo + ": value '" + fields[i]
                            + "' in dimension " + i + " has no normalized mapping");
                    }
                    pw.writeInt(normalized);
                    writer.print(normalized + ",");
                }
                writer.println();
            }
        } catch (IOException ie) {
            ie.printStackTrace();
        }
    }
}