Java正则表达式提取网页url和链接文字
www.diybl.com 时间 : 2010-06-26 作者:佚名 编辑:壹枝雪糕 点击: [ 评论 ]
import java.net.*;
import java.io.*;
import java.util.regex.*;
/**
 * Downloads a web page and uses regular expressions to extract hyperlinks and
 * their link text from a delimited region of that page.
 *
 * <p>Typical use (see {@link #main(String[])}): construct with a page URL and a
 * pair of begin/end markers, call {@link #getSourceContent(String)} to fetch the
 * page, {@link #getMatchContent(String, String)} to cut out the region between
 * the markers, and {@link #getString(String)} to print every matching link.
 */
public class Urls2 {
    /** Matches an anchor tag whose href is an {@code http://blog.sina.com.cn/s/....html} address. */
    private static final Pattern LINK_PATTERN =
            Pattern.compile("<a href=\"http://blog.sina.com.cn/s/.*?\\.html\".*?>.*?/a>");

    /** Matches from the first {@code >} of an anchor through its closing {@code </a>}. */
    private static final Pattern TITLE_PATTERN = Pattern.compile(">.*?</a>");

    /** Markup stripped from a title match so only the visible link text remains. */
    private static final Pattern TITLE_MARKUP = Pattern.compile(">|</a>|<font.*?>|</font>");

    /** Matches an {@code http://....html} address inside an anchor tag. */
    private static final Pattern URL_PATTERN = Pattern.compile("http://.*?\\.html");

    /** Markup stripped from a URL match. */
    private static final Pattern URL_MARKUP = Pattern.compile("<a href=|>");

    /** Address of the page to scrape. */
    String sourceURL;
    /** Text of the downloaded page; stays {@code null} until a fetch succeeds. */
    String sourceContent;
    /** Regex marking where the region of interest begins. */
    String beginStr;
    /** Regex marking where the region of interest ends. */
    String endStr;
    /** Region of the page found by the last successful {@link #getMatchContent(String, String)}. */
    String matchContent;

    /**
     * Demo entry point: fetches a fixed blog index page, isolates its
     * {@code <body ...</body>} region and prints every matching link.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        Urls2 urls2 = new Urls2("http://blog.sina.com.cn/lm/21/2006/0427/3.html", "<body", "</body>");
        urls2.getSourceContent(urls2.sourceURL);
        urls2.matchContent = urls2.getMatchContent(urls2.beginStr, urls2.endStr);
        urls2.getString(urls2.matchContent);
    }

    /** Creates an instance with every field left {@code null}. */
    public Urls2() {
    }

    /**
     * Creates an instance for the given page and region markers. Nothing is fetched here.
     *
     * @param sourceURL1 address of the page to scrape
     * @param beginStr1  regex marking the start of the region of interest
     * @param endStr1    regex marking the end of the region of interest
     */
    public Urls2(String sourceURL1, String beginStr1, String endStr1) {
        sourceURL = sourceURL1;
        beginStr = beginStr1;
        endStr = endStr1;
    }

    /**
     * Downloads the page at {@code URLStr} into {@link #sourceContent}.
     *
     * <p>Lines are concatenated without their line terminators, so the stored
     * text contains no line breaks from the original page; the lazy
     * {@code .*?} patterns used elsewhere in this class depend on that because
     * {@code .} does not match line terminators by default.
     *
     * <p>If the URL is malformed or an I/O error occurs, the stack trace is
     * printed and {@link #sourceContent} is left unchanged. The page is decoded
     * with the platform default charset.
     *
     * @param URLStr address of the page to download
     */
    public void getSourceContent(String URLStr) {
        StringBuilder sb = new StringBuilder();
        BufferedReader br = null;
        try {
            URL newURL = new URL(URLStr);
            br = new BufferedReader(new InputStreamReader(newURL.openStream()));
            String temp;
            while ((temp = br.readLine()) != null) {
                sb.append(temp);
            }
            sourceContent = sb.toString();
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Always release the connection, including when reading fails part-way.
            if (br != null) {
                try {
                    br.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing fails; the data was already read or lost.
                }
            }
        }
    }

    /**
     * Finds the first stretch of {@link #sourceContent} that starts with
     * {@code beginStr} and ends at the nearest following {@code endStr}.
     *
     * <p>Both arguments are inserted into a regular expression unquoted, so
     * regex metacharacters in them keep their special meaning. On success the
     * result is also stored in {@link #matchContent}.
     *
     * @param beginStr regex for the start of the region
     * @param endStr   regex for the end of the region
     * @return the matched region including both markers, or {@code null} if no
     *         page has been loaded or the markers are not found
     */
    public String getMatchContent(String beginStr, String endStr) {
        if (sourceContent == null) {
            // A failed or skipped fetch leaves no text to search.
            return null;
        }
        String regex = beginStr + ".*?" + endStr;
        Matcher mt = Pattern.compile(regex).matcher(sourceContent);
        if (mt.find()) {
            matchContent = mt.group();
            return matchContent;
        }
        return null;
    }

    /**
     * Prints every blog link found in {@code s} to standard output.
     *
     * <p>For each anchor matched by {@link #LINK_PATTERN} this prints the raw
     * anchor tag, its link text, its URL and a blank line; after all anchors it
     * prints the total number found.
     *
     * @param s HTML fragment to scan; {@code null} is treated as containing no links
     */
    public void getString(String s) {
        int counter = 0; // number of anchors matched
        if (s != null) {
            Matcher linkMatcher = LINK_PATTERN.matcher(s);
            while (linkMatcher.find()) {
                String anchor = linkMatcher.group();
                counter++;
                System.out.println(anchor);
                printTitles(anchor);
                printUrls(anchor);
                System.out.println(); // blank line between results
            }
        }
        System.out.println("共有" + counter + "个符合结果");
    }

    /** Prints the link text of {@code anchor} with anchor and font markup removed. */
    private static void printTitles(String anchor) {
        Matcher titleMatcher = TITLE_PATTERN.matcher(anchor);
        while (titleMatcher.find()) {
            String title = TITLE_MARKUP.matcher(titleMatcher.group()).replaceAll("");
            System.out.println("标题:" + title);
        }
    }

    /** Prints each {@code http://....html} address contained in {@code anchor}. */
    private static void printUrls(String anchor) {
        Matcher urlMatcher = URL_PATTERN.matcher(anchor);
        while (urlMatcher.find()) {
            String urls = URL_MARKUP.matcher(urlMatcher.group()).replaceAll("");
            System.out.println("网址:" + urls);
        }
    }
}