JavaRanch » Java Forums »
Java »
Java in General
| Author |
Problem with this EmailExtractor
|
Maki Jav
Ranch Hand
Joined: May 09, 2002
Posts: 423
|
|
Problem with this EmailExtractor when using with Ms doc type files Hi, I have coded this email extractor which can be used with text, html, doc, files given the emails are written in it in abc@zxy.com,dbeahfs@yahoo.com,...... import java.awt.*; import javax.swing.*; import java.io.*; import java.awt.event.*; import java.sql.*; import java.util.*; public class EmailExtractor{ static char allowed[]={'a','b','c','d','e','f','g','h','i','j','k','l','m', 'n','o','p','q','r','s','t','u','v','w','x','y','z', 'A','B','C','D','E','F','G','H','I','J','K','L','M', 'N','O','P','Q','R','S','T','U','V','W','X','Y','Z', '1','2','3','4','5','6','7','8','9','0','_','-' }; public static void main(String args[]){ long a=System.currentTimeMillis(); //emailslist=emailslist.toLowerCase(); //addEmail(emailslist); //************************************************************ Vector eaddress=new Vector(); String emailadd=""; String file =""; /* if file name is given at command prompt choose it then eg C:> java EmailExtractor efile.txt */ if(args.length>0)file=args[0]; /* otherwise choose default file */ if(file=="") file="email.txt"; File f= new File(file); if (f.canRead()){ try{ FileInputStream fis=new FileInputStream(f); InputStreamReader inr=new InputStreamReader(fis); BufferedReader br=new BufferedReader(inr); String readfile=br.readLine(); String temp=""; while(readfile!=null){ readfile=readfile.trim(); String email=""; StringTokenizer st=new StringTokenizer(readfile,",)('\">< "); if(readfile!=null) while (st.hasMoreTokens()) { email=st.nextToken(); email=email.trim(); int at=email.indexOf('@'); int dot=email.indexOf('.'); boolean accept=false; int attherat=0; int dots=0; // if dot comes after atleast one space after @ if(at>-1 && dot>at+1) for(int i=0;i<email.length()-1;i++){ // pick a character from char emc=email.charAt(i); // leave these alone if(emc=='.'){dots++;continue;} if(emc=='@'){attherat++;continue;} for (int ok=0;ok<allowed.length;ok++){ if(emc==allowed[ok]){accept=true;break;} }// for }//for if(accept && dots<3 && attherat==1) eaddress.add(email); } readfile=br.readLine(); } //fis.close(); //inr.close(); br.close(); }catch(Exception e){System.out.println("Main Method "+e);} } else { System.out.println("No File Provided!"); } //************************************************************* for(int i=0;i<eaddress.size();i++) System.out.println((1+i)+" "+eaddress.get(i)); System.out.println("DONE"); long b=System.currentTimeMillis(); System.out.println("Time Taken " +(b-a)+" milliseconds."); } } When I used it with doc type file, It is extracting three lines of the document formating information at the end too. Please help me with this so that it extracts emails only. Thanx in Advance, Maki Jav [ November 15, 2003: Message edited by: Maki Jav ]
|
Help gets you when you need it!
|
 |
 |
|
|
subject: Problem with this EmailExtractor
|
|
|
|