BigData: Reading a RSS feed

gravatar

John Minkjan

Based on the twitter feed reader I made a simple RSS feed reader using BIML. The Feed URI term is passed trough the variable RSSURI. Be aware not to poll the RSS feeder every second, it probably will exclude your IP. For most feeds once every hour is enough! The package will only retrieve “new” feed items which are not jet in your table.

published 10.06.12

last updated 10.06.12


Share

Tags

  • BigData
  • Lookup
  • ScriptComponent
  • ScriptComponentSource
  • Twitter
                            


<Biml xmlns="http://schemas.varigence.com/biml.xsd">
	<Annotations>
		<Annotation>
			File: RSS.Feed.Reader.biml
			Demo on reading a twitter feed
			Enviroment:
			DB: MS-SQL2012
			BIML: 1.6 VS2010 BIDS Helper
			(c) John Minkjan biml101.blogspot.com
		</Annotation>
		<Annotation>
			RSSFeedReader Table Create script:
			
			CREATE TABLE [dbo].[RSSFeedReader](
			[ChannelTitle] [nvarchar](255) NULL,
			[ChannelDescription] [nvarchar](2048) NULL,
			[ChannelLink] [nvarchar](255) NULL,
			[ChannelLanguage] [nvarchar](255) NULL,
			[ItemDescription] [nvarchar](2048) NULL,
			[ItemGuid] [nvarchar](128) NOT NULL,
			[ItemTitle] [nvarchar](255) NOT NULL,
			[ItemPubDate] [nvarchar](50) NOT NULL,
			[ItemLink] [nvarchar](255) NULL,
			[RSSURI] [nvarchar](255) NULL,
			CONSTRAINT [PK_RSSFeedReader] PRIMARY KEY CLUSTERED
			(
			[ItemGuid] ASC,
			[ItemTitle] ASC,
			[ItemPubDate] ASC
			)
			)
		</Annotation>
	</Annotations>
	<Connections>
		<OleDbConnection
	Name="CnOleDBAdventureWorks2012"
	ConnectionString="Data Source=.\MSSQL2012;Initial Catalog=AdventureWorks2012;Provider=SQLNCLI10.1;Integrated Security=SSPI;Connect Timeout=30;"
	RetainSameConnection="true">
		</OleDbConnection>
	</Connections>
	
	<ScriptProjects>
		<ScriptComponentProject Name="SC_RSS_READER">
			<AssemblyReferences>
				<AssemblyReference AssemblyPath="Microsoft.SqlServer.DTSPipelineWrap" />
				<AssemblyReference AssemblyPath="Microsoft.SqlServer.DTSRuntimeWrap" />
				<AssemblyReference AssemblyPath="Microsoft.SqlServer.PipelineHost" />
				<AssemblyReference AssemblyPath="Microsoft.SqlServer.TxScript" />
				<AssemblyReference AssemblyPath="System.dll" />
				<AssemblyReference AssemblyPath="System.AddIn.dll" />
				<AssemblyReference AssemblyPath="System.Data.dll" />
				<AssemblyReference AssemblyPath="System.Xml.dll" />
			</AssemblyReferences>
			<OutputBuffers>
				<OutputBuffer Name="Output0" IsSynchronous ="false">
					<Annotations>
						<Annotation>
							IsSynchronous="false" ==>
							When there are no non-synchronus buffers,
							the compiler does not emit the CreateNewOutputRows
							virtual base method (to match the BIDS/SSDT behavior)
						</Annotation>
					</Annotations>
					<Columns>
						<Column Name="ChannelTitle" DataType="String" Length="255"></Column> 
						<Column Name="ChannelDescription" DataType="String" Length="2048"></Column>
						<Column Name="ChannelLink" DataType="String" Length="255"></Column>
						<Column Name="ChannelLanguage" DataType="String" Length="255"></Column>
						<Column Name="ItemDescription" DataType="String" Length="2048"></Column>
						<Column Name="ItemGuid" DataType="String" Length="128"></Column>
						<Column Name="ItemTitle" DataType="String" Length="255"></Column>
						<Column Name="ItemPubDate" DataType="String" Length="50"></Column>
						<Column Name="ItemLink" DataType="String" Length="255"></Column>
						<Column Name="RSSURI" DataType="String" Length="255"></Column>
					</Columns>					
				</OutputBuffer>							     
			</OutputBuffers>
			<ReadOnlyVariables>				
				<Variable Namespace="User" VariableName="RSSURI" DataType="String"></Variable>
			</ReadOnlyVariables>
			<Files>
				<File Path="AssemblyInfo.cs">
					using System.Reflection;
					using System.Runtime.CompilerServices;
					[assembly: AssemblyTitle("SC_RSS_READER")]
					[assembly: AssemblyDescription("")]
					[assembly: AssemblyConfiguration("")]
					[assembly: AssemblyCompany("")]
					[assembly: AssemblyProduct("SC_RSS_READER")]
					[assembly: AssemblyCopyright("Copyright @  2012")]
					[assembly: AssemblyTrademark("")]
					[assembly: AssemblyCulture("")]
					[assembly: AssemblyVersion("1.0.*")]
				</File>
				<File Path="main.cs">
					<![CDATA[
					using System;
					using System.Data;
					using Microsoft.SqlServer.Dts.Pipeline.Wrapper;
					using Microsoft.SqlServer.Dts.Runtime.Wrapper;
					using System.Xml;

					[Microsoft.SqlServer.Dts.Pipeline.SSISScriptComponentEntryPointAttribute]
					public class ScriptMain : UserComponent
					{

					public override void CreateNewOutputRows()
					{
					XmlTextReader rssReader;
					XmlDocument rssDoc;
					XmlNode nodeRss;
					XmlNode nodeChannel;
					XmlNode nodeItem;

					string RSSURI = this.Variables.RSSURI.ToString();
					rssReader = new XmlTextReader(RSSURI);
					rssDoc = new XmlDocument();

					rssDoc.Load(rssReader);

					for (int i = 0; i < rssDoc.ChildNodes.Count; i++)
					{

					if (rssDoc.ChildNodes[i].Name == "rss")
					{
					nodeRss = rssDoc.ChildNodes[i];           

					for (int j = 0; j < nodeRss.ChildNodes.Count; j++)
					{
					if (nodeRss.ChildNodes[j].Name == "channel")
					{
					nodeChannel = nodeRss.ChildNodes[j];
                                               
					for (int k = 0; k < nodeChannel.ChildNodes.Count; k++)
					{

					if (nodeChannel.ChildNodes[k].Name == "item")
					{
					nodeItem = nodeChannel.ChildNodes[k];
					Output0Buffer.AddRow();
					Output0Buffer.ChannelTitle = nodeChannel["title"].InnerText;
					Output0Buffer.ChannelDescription = nodeChannel["description"].InnerText;
					Output0Buffer.ChannelLink = nodeChannel["link"].InnerText;
					Output0Buffer.ChannelLanguage = nodeChannel["language"].InnerText;
					Output0Buffer.ItemDescription = nodeItem["description"].InnerText;
					Output0Buffer.ItemGuid = nodeItem["guid"].InnerText;
					Output0Buffer.ItemTitle = nodeItem["title"].InnerText;
					Output0Buffer.ItemPubDate = nodeItem["pubDate"].InnerText;
					Output0Buffer.ItemLink = nodeItem["link"].InnerText;
					Output0Buffer.RSSURI = RSSURI;                               
					}
					}
					}
					}
					}
					}
					}

					}
					]]>
				</File>
			</Files>
		</ScriptComponentProject>
	</ScriptProjects>
	<Packages>
		<Package Name ="RSS.Feed.Reader" ConstraintMode="Linear">
			<Variables>
				<Variable Name="RSSURI" DataType="String">http://rss.cnn.com/rss/edition.rss</Variable>				
			</Variables>
			<Tasks>
				<Dataflow Name ="DFT Read RSS Feed" >
					<Transformations>
						<ScriptComponentSource Name="SC RSS Feed Reader">
							<ScriptComponentProjectReference ScriptComponentProjectName="SC_RSS_READER"></ScriptComponentProjectReference>
						</ScriptComponentSource>
						<Lookup Name="LKP Existing ItemGuids" 
								NoMatchBehavior="RedirectRowsToNoMatchOutput"
								OleDbConnectionName="CnOleDBAdventureWorks2012"  
								CacheMode="Partial">
							<ExternalTableInput Table="[dbo].[RSSFeedReader]"></ExternalTableInput>
							<Inputs>
								<Column SourceColumn="ItemGuid" TargetColumn="ItemGuid"></Column>
								<Column SourceColumn="ItemPubDate" TargetColumn="ItemPubDate"></Column>
								<Column SourceColumn="ItemTitle" TargetColumn="ItemTitle"></Column>
							</Inputs>
						</Lookup>					
						<OleDbDestination Name="ODD dbo RSSFeedReader" ConnectionName="CnOleDBAdventureWorks2012">
							<InputPath OutputPathName="LKP Existing ItemGuids.NoMatch"></InputPath>
							<ExternalTableOutput Table="[dbo].[RSSFeedReader]"></ExternalTableOutput>
						</OleDbDestination>
					</Transformations>					
				</Dataflow>
			</Tasks>
		</Package>
	</Packages>
</Biml>
                        

File: RSS.Feed.Reader.biml; Demo on reading a twitter feed; Enviroment: DB: MS-SQL2012 BIML: 1.6 VS2010 BIDS Helper; (c) John Minkjan biml101.blogspot.com; Also Published here:http://biml101.blogspot.com/2012/10/biml-bigdata-reading-rss-feed.html

You are not authorized to comment. A verification email has been sent to your email address. Please verify your account.

Comments

There are no comments yet.